Skip to content

Commit d51321c

Browse files
authored
switch housing dataset to wine (dotnet#170)
* replace the housing UCI dataset with the wine quality dataset
1 parent 76393f4 commit d51321c

File tree

6 files changed

+68
-17
lines changed

6 files changed

+68
-17
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,3 +328,5 @@ ASALocalRun/
328328

329329
# MSBuild Binary and Structured Log
330330
*.binlog
331+
# Ignore external test datasets.
332+
/test/data/external/

build.proj

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
<Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), Directory.Build.props))\Directory.Build.props" />
99

1010
<Import Project="$(ToolsDir)VersionTools.targets" Condition="Exists('$(ToolsDir)VersionTools.targets')" />
11-
11+
<UsingTask TaskName="DownloadFilesFromUrl" AssemblyFile="$(ToolsDir)Microsoft.DotNet.Build.Tasks.dll"/>
1212
<PropertyGroup>
1313
<!-- To disable the restoration of packages, set RestoreDuringBuild=false or pass /p:RestoreDuringBuild=false.-->
1414
<RestoreDuringBuild Condition="'$(RestoreDuringBuild)'==''">true</RestoreDuringBuild>
@@ -33,6 +33,7 @@
3333
RestoreProjects;
3434
BuildNative;
3535
$(TraversalBuildDependsOn);
36+
DownloadExternalTestFiles;
3637
RunTests;
3738
</TraversalBuildDependsOn>
3839
</PropertyGroup>
@@ -56,13 +57,26 @@
5657
<ItemGroup>
5758
<PkgProject Include="pkg\**\*.nupkgproj" />
5859
</ItemGroup>
59-
60+
6061
<MSBuild Projects="@(PkgProject)"
6162
Targets="Restore" />
6263
<MSBuild Projects="@(PkgProject)"
6364
Targets="Pack" />
6465
</Target>
6566

67+
<ItemGroup>
68+
<TestFile Include="$(MSBuildThisFileDirectory)/test/data/external/winequality-white.csv"
69+
Url="https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
70+
DestinationFile="$(MSBuildThisFileDirectory)test/data/external/winequality-white.csv" />
71+
</ItemGroup>
72+
73+
<Target Name="DownloadExternalTestFiles" Inputs="@(TestFile)" Outputs="%(TestFile.DestinationFile)">
74+
<Message Importance="High" Text="Downloading external test files... %(TestFile.DestinationFile)" />
75+
<DownloadFilesFromUrl Items="@(TestFile)"
76+
DestinationDir="test/data/external"
77+
TreatErrorsAsWarnings="true"/>
78+
</Target>
79+
6680
<Target Name="RunTests" Condition="'$(RunTests)'=='true'">
6781
<MSBuild Projects="test\run-tests.proj"
6882
Targets="RunTests" />

test/Microsoft.ML.Core.Tests/UnitTests/TestCSharpApi.cs

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
using ML = Microsoft.ML;
66
using Microsoft.ML.Runtime;
7+
using Microsoft.ML.Data;
78
using Microsoft.ML.Runtime.Data;
89
using Microsoft.ML.Runtime.EntryPoints;
910
using Microsoft.ML.TestFramework;
@@ -269,10 +270,10 @@ public void TestCrossValidationBinaryMacro()
269270
}
270271
}
271272

272-
[Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")]
273+
[Fact]
273274
public void TestCrossValidationMacro()
274275
{
275-
var dataPath = GetDataPath(@"housing.txt");
276+
var dataPath = GetDataPath(TestDatasets.winequality.trainFilename);
276277
using (var env = new TlcEnvironment())
277278
{
278279
var subGraph = env.CreateExperiment();
@@ -295,7 +296,30 @@ public void TestCrossValidationMacro()
295296
var modelCombineOutput = subGraph.Add(modelCombine);
296297

297298
var experiment = env.CreateExperiment();
298-
var importInput = new ML.Data.TextLoader(dataPath);
299+
var importInput = new ML.Data.TextLoader(dataPath)
300+
{
301+
Arguments = new TextLoaderArguments
302+
{
303+
Separator = new[] { ';' },
304+
HasHeader = true,
305+
Column = new[]
306+
{
307+
new TextLoaderColumn()
308+
{
309+
Name = "Label",
310+
Source = new [] { new TextLoaderRange(11) },
311+
Type = DataKind.Num
312+
},
313+
314+
new TextLoaderColumn()
315+
{
316+
Name = "Features",
317+
Source = new [] { new TextLoaderRange(0,10) },
318+
Type = DataKind.Num
319+
}
320+
}
321+
}
322+
};
299323
var importOutput = experiment.Add(importInput);
300324

301325
var crossValidate = new ML.Models.CrossValidator
@@ -324,7 +348,7 @@ public void TestCrossValidationMacro()
324348
Assert.True(b);
325349
double val = 0;
326350
getter(ref val);
327-
Assert.Equal(3.32, val, 1);
351+
Assert.Equal(0.58, val, 1);
328352
b = cursor.MoveNext();
329353
Assert.False(b);
330354
}

test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,7 @@ public void EntryPointTextToKeyToText()
910910
}
911911

912912
private void RunTrainScoreEvaluate(string learner, string evaluator, string dataPath, string warningsPath, string overallMetricsPath,
913-
string instanceMetricsPath, string confusionMatrixPath = null)
913+
string instanceMetricsPath, string confusionMatrixPath = null, string loader = null)
914914
{
915915
string inputGraph = string.Format(@"
916916
{{
@@ -919,6 +919,7 @@ private void RunTrainScoreEvaluate(string learner, string evaluator, string data
919919
'Name': 'Data.CustomTextLoader',
920920
'Inputs': {{
921921
'InputFile': '$file'
922+
{8}
922923
}},
923924
'Outputs': {{
924925
'Data': '$AllData'
@@ -978,7 +979,8 @@ private void RunTrainScoreEvaluate(string learner, string evaluator, string data
978979
}}
979980
}}", learner, evaluator, EscapePath(dataPath), EscapePath(warningsPath), EscapePath(overallMetricsPath), EscapePath(instanceMetricsPath),
980981
confusionMatrixPath != null ? ", 'ConfusionMatrix': '$ConfusionMatrix'" : "",
981-
confusionMatrixPath != null ? string.Format(", 'ConfusionMatrix' : '{0}'", EscapePath(confusionMatrixPath)) : "");
982+
confusionMatrixPath != null ? string.Format(", 'ConfusionMatrix' : '{0}'", EscapePath(confusionMatrixPath)) : "",
983+
string.IsNullOrWhiteSpace(loader) ? "" : string.Format(",'CustomSchema': '{0}'", loader));
982984

983985
var jsonPath = DeleteOutputPath("graph.json");
984986
File.WriteAllLines(jsonPath, new[] { inputGraph });
@@ -1036,15 +1038,16 @@ public void EntryPointEvaluateMultiClass()
10361038
Assert.Equal(3, CountRows(loader));
10371039
}
10381040

1039-
[Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")]
1041+
[Fact]
10401042
public void EntryPointEvaluateRegression()
10411043
{
1042-
var dataPath = GetDataPath("housing.txt");
1044+
var dataPath = GetDataPath(TestDatasets.winequality.trainFilename);
10431045
var warningsPath = DeleteOutputPath("warnings.idv");
10441046
var overallMetricsPath = DeleteOutputPath("overall.idv");
10451047
var instanceMetricsPath = DeleteOutputPath("instance.idv");
10461048

1047-
RunTrainScoreEvaluate("Trainers.StochasticDualCoordinateAscentRegressor", "Models.RegressionEvaluator", dataPath, warningsPath, overallMetricsPath, instanceMetricsPath);
1049+
RunTrainScoreEvaluate("Trainers.StochasticDualCoordinateAscentRegressor", "Models.RegressionEvaluator",
1050+
dataPath, warningsPath, overallMetricsPath, instanceMetricsPath, loader: TestDatasets.winequality.loaderSettings);
10481051

10491052
using (var loader = new BinaryLoader(Env, new BinaryLoader.Arguments(), warningsPath))
10501053
Assert.Equal(0, CountRows(loader));
@@ -1053,7 +1056,7 @@ public void EntryPointEvaluateRegression()
10531056
Assert.Equal(1, CountRows(loader));
10541057

10551058
using (var loader = new BinaryLoader(Env, new BinaryLoader.Arguments(), instanceMetricsPath))
1056-
Assert.Equal(104, CountRows(loader));
1059+
Assert.Equal(975, CountRows(loader));
10571060
}
10581061

10591062
[Fact]
@@ -1068,10 +1071,10 @@ public void EntryPointSDCAMultiClass()
10681071
TestEntryPointRoutine("iris.txt", "Trainers.StochasticDualCoordinateAscentClassifier");
10691072
}
10701073

1071-
[Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")]
1074+
[Fact()]
10721075
public void EntryPointSDCARegression()
10731076
{
1074-
TestEntryPointRoutine("housing.txt", "Trainers.StochasticDualCoordinateAscentRegressor");
1077+
TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.StochasticDualCoordinateAscentRegressor", loader: TestDatasets.winequality.loaderSettings);
10751078
}
10761079

10771080
[Fact]
@@ -1142,10 +1145,10 @@ public void EntryPointHogwildSGD()
11421145
TestEntryPointRoutine("breast-cancer.txt", "Trainers.StochasticGradientDescentBinaryClassifier");
11431146
}
11441147

1145-
[Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/3")]
1148+
[Fact()]
11461149
public void EntryPointPoissonRegression()
11471150
{
1148-
TestEntryPointRoutine("housing.txt", "Trainers.PoissonRegressor");
1151+
TestEntryPointRoutine(TestDatasets.winequality.trainFilename, "Trainers.PoissonRegressor", loader: TestDatasets.winequality.loaderSettings);
11491152
}
11501153

11511154
[Fact]

test/Microsoft.ML.TestFramework/Datasets.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,14 @@ public static class TestDatasets
152152
testFilename = "housing.txt"
153153
};
154154

155+
public static TestDataset winequality = new TestDataset
156+
{
157+
name = "wine",
158+
trainFilename = "external/winequality-white.csv",
159+
testFilename = "external/winequality-white.csv",
160+
loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=; header+"
161+
};
162+
155163
public static TestDataset msm = new TestDataset
156164
{
157165
// REVIEW: Why is the MSM train set smaller than the test set? Reverse these!

test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@
88

99
<ItemGroup>
1010
<NativeAssemblyReference Include="CpuMathNative" />
11-
<NativeAssemblyReference Include="FastTreeNative" />
11+
<NativeAssemblyReference Include="FastTreeNative" />
1212
</ItemGroup>
1313
</Project>

0 commit comments

Comments
 (0)