Skip to content

Commit c8a4c7d

Browse files
authored
Data catalog done (#3021)
* Adding XML documentation to a public API that had no documentation. * Adding a TrainTestSplit sample. Small corrections to the images doc.xml.
1 parent 807d813 commit c8a4c7d

File tree

4 files changed

+154
-20
lines changed

4 files changed

+154
-20
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
namespace Microsoft.ML.Samples.Dynamic
55
{
6-
public static class Bootstrap
6+
public static class BootstrapSample
77
{
88
public static void Example()
99
{
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Collections.Immutable;
4+
using System.Linq;
5+
using Microsoft.ML.Data;
6+
using static Microsoft.ML.DataOperationsCatalog;
7+
8+
namespace Microsoft.ML.Samples.Dynamic
9+
{
10+
/// <summary>
11+
/// Sample class showing how to use TrainTestSplit.
12+
/// </summary>
13+
public static class TrainTestSplit
14+
{
15+
public static void Example()
16+
{
17+
// Create the ML.NET MLContext object, needed for the pipeline.
18+
var mlContext = new MLContext();
19+
20+
// Generate some data points.
21+
var examples = GenerateRandomDataPoints(10);
22+
23+
// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
24+
var dataview = mlContext.Data.LoadFromEnumerable(examples);
25+
26+
// Leave out 10% of the dataset for testing. For some types of problems, for example for ranking or anomaly detection,
27+
// we must ensure that all rows with the same value in a particular column end up in the same split.
28+
// So below, we specify Group column as the column containing the sampling keys.
29+
// Notice how keeping the rows with the same value in the Group column overrides the testFraction definition.
30+
TrainTestData split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "Group");
31+
32+
PrintPreviewRows(split);
33+
34+
// The data in the Train split.
35+
// [Group, 1], [Features, 0.8173254]
36+
// [Group, 1], [Features, 0.5581612]
37+
// [Group, 1], [Features, 0.5588848]
38+
// [Group, 1], [Features, 0.4421779]
39+
// [Group, 1], [Features, 0.2737045]
40+
41+
// The data in the Test split.
42+
// [Group, 0], [Features, 0.7262433]
43+
// [Group, 0], [Features, 0.7680227]
44+
// [Group, 0], [Features, 0.2060332]
45+
// [Group, 0], [Features, 0.9060271]
46+
// [Group, 0], [Features, 0.9775497]
47+
48+
// Example of a split without specifying a sampling key column.
49+
split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.2);
50+
PrintPreviewRows(split);
51+
52+
// The data in the Train split.
53+
// [Group, 0], [Features, 0.7262433]
54+
// [Group, 1], [Features, 0.8173254]
55+
// [Group, 0], [Features, 0.7680227]
56+
// [Group, 1], [Features, 0.5581612]
57+
// [Group, 0], [Features, 0.2060332]
58+
// [Group, 1], [Features, 0.4421779]
59+
// [Group, 0], [Features, 0.9775497]
60+
// [Group, 1], [Features, 0.2737045]
61+
62+
// The data in the Test split.
63+
// [Group, 1], [Features, 0.5588848]
64+
// [Group, 0], [Features, 0.9060271]
65+
66+
}
67+
68+
private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
69+
{
70+
var random = new Random(seed);
71+
for (int i = 0; i < count; i++)
72+
{
73+
yield return new DataPoint
74+
{
75+
Group = i % 2,
76+
77+
// Create a random feature value.
78+
Features = (float)random.NextDouble()
79+
};
80+
}
81+
}
82+
83+
// Example with group and features columns. A data set is a collection of such examples.
84+
private class DataPoint
85+
{
86+
public float Group { get; set; }
87+
88+
public float Features { get; set; }
89+
}
90+
91+
// Helper that prints a preview of the rows in the train and test splits.
92+
private static void PrintPreviewRows(TrainTestData split)
93+
{
94+
95+
var trainDataPreview = split.TrainSet.Preview();
96+
var testDataPreview = split.TestSet.Preview();
97+
98+
Console.WriteLine($"The data in the Train split.");
99+
foreach (var row in trainDataPreview.RowView)
100+
Console.WriteLine($"{row.Values[0]}, {row.Values[1]}");
101+
102+
Console.WriteLine($"\nThe data in the Test split.");
103+
foreach (var row in testDataPreview.RowView)
104+
Console.WriteLine($"{row.Values[0]}, {row.Values[1]}");
105+
}
106+
}
107+
}

src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ internal DataOperationsCatalog(IHostEnvironment env)
7070
/// <example>
7171
/// <format type="text/markdown">
7272
/// <![CDATA[
73-
/// [!code-csharp[BootstrapSample](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
73+
/// [!code-csharp[LoadFromEnumerable](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
7474
/// ]]>
7575
/// </format>
7676
/// </example>
@@ -82,6 +82,25 @@ public IDataView LoadFromEnumerable<TRow>(IEnumerable<TRow> data, SchemaDefiniti
8282
return DataViewConstructionUtils.CreateFromEnumerable(_env, data, schemaDefinition);
8383
}
8484

85+
/// <summary>
86+
/// Create a new <see cref="IDataView"/> over an enumerable of items of a user-defined type, using the provided <see cref="DataViewSchema"/>,
87+
/// which might contain more information about the schema than the type can capture.
88+
/// </summary>
89+
/// <remarks>
90+
/// The user maintains ownership of the <paramref name="data"/> and the resulting data view will
91+
/// never alter the contents of the <paramref name="data"/>.
92+
/// Since <see cref="IDataView"/> is assumed to be immutable, the user is expected to support
93+
/// multiple enumeration of the <paramref name="data"/> that would return the same results, unless
94+
/// the user knows that the data will only be cursored once.
95+
/// One typical usage for streaming data view could be: create the data view that lazily loads data
96+
/// as needed, then apply pre-trained transformations to it and cursor through it for transformation
97+
/// results.
98+
/// One practical usage of this would be to supply the feature column names through the <see cref="DataViewSchema.Annotations"/>.
99+
/// </remarks>
100+
/// <typeparam name="TRow">The user-defined type of the items to convert to an <see cref="IDataView"/>.</typeparam>
101+
/// <param name="data">The enumerable of <typeparamref name="TRow"/> items to convert to an <see cref="IDataView"/>.</param>
102+
/// <param name="schema">The schema of the returned <see cref="IDataView"/>.</param>
103+
/// <returns>An <see cref="IDataView"/> with the given <paramref name="schema"/>.</returns>
85104
public IDataView LoadFromEnumerable<TRow>(IEnumerable<TRow> data, DataViewSchema schema)
86105
where TRow : class
87106
{
@@ -102,7 +121,7 @@ public IDataView LoadFromEnumerable<TRow>(IEnumerable<TRow> data, DataViewSchema
102121
/// <example>
103122
/// <format type="text/markdown">
104123
/// <![CDATA[
105-
/// [!code-csharp[BootstrapSample](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
124+
/// [!code-csharp[CreateEnumerable](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
106125
/// ]]>
107126
/// </format>
108127
/// </example>
@@ -381,6 +400,13 @@ public IDataView TakeRows(IDataView input, long count)
381400
/// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
382401
/// If <see langword="null"/> no row grouping will be performed.</param>
383402
/// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
403+
/// <example>
404+
/// <format type="text/markdown">
405+
/// <![CDATA[
406+
/// [!code-csharp[TrainTestSplit](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/TrainTestSplit.cs)]
407+
/// ]]>
408+
/// </format>
409+
/// </example>
384410
public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
385411
{
386412
_env.CheckValue(data, nameof(data));

src/Microsoft.ML.ImageAnalytics/doc.xml

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,15 @@
88
</summary>
99
<remarks>
1010
<format type="text/markdown">
11-
<![CDATA[
12-
13-
## Remarks
14-
The images might be converted to grayscale to reduce the complexity of the model.
15-
The grayed out images contain less information to process than the colored images.
16-
Another use case for converting to grayscale is to generate new images out of the existing ones, so you can have a larger dataset,
17-
a technique known as [data augmentation](http://www.stat.harvard.edu/Faculty_Content/meng/JCGS01.pdf)</a>.
18-
19-
For end-to-end image processing pipelines, and scenarios in your applications, see the
20-
[examples in the machinelearning-samples github repository](https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started).</a>
11+
<![CDATA[
12+
## Remarks
13+
The images might be converted to grayscale to reduce the complexity of the model.
14+
The grayed out images contain less information to process than the colored images.
15+
Another use case for converting to grayscale is to generate new images out of the existing ones, so you can have a larger dataset,
16+
a technique known as [data augmentation](http://www.stat.harvard.edu/Faculty_Content/meng/JCGS01.pdf).
17+
18+
For end-to-end image processing pipelines, and scenarios in your applications, see the
19+
[examples in the machinelearning-samples github repository](https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started).
2120
]]></format>
2221
<seealso cref="ImageEstimatorsCatalog" />
2322
<seealso cref="ImageLoadingEstimator"/>
@@ -31,10 +30,11 @@
3130
<remarks>
3231
The ImagePixelExtractingEstimator extracts the pixels from the input images and converts them into a vector of numbers.
3332
This can be further used as features by the algorithms added to the pipeline.
34-
35-
ImagePixelExtractingEstimator expects a <see cref="ImageResizingEstimator"/> in the pipeline, before it is used.
36-
For end-to-end image processing pipelines, and scenarios in your applications, see the
37-
<a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository</a>.
33+
<para>
34+
ImagePixelExtractingEstimator expects a <see cref="ImageResizingEstimator"/> in the pipeline, before it is used.
35+
For end-to-end image processing pipelines, and scenarios in your applications, see the
36+
<a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository</a>.
37+
</para>
3838
<seealso cref="ImageEstimatorsCatalog" />
3939
<seealso cref="ImageLoadingEstimator"/>
4040
<seealso cref="ImageResizingEstimator"/>
@@ -50,9 +50,10 @@
5050
extract features for usage in the machine learning algorithms.
5151
Those pre-trained models have a defined width and height for their input images, so often, after getting loaded, the images will need to get resized before
5252
further processing.
53-
54-
For end-to-end image processing pipelines, and scenarios in your applications, see the
55-
<a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository.</a>
53+
<para>
54+
For end-to-end image processing pipelines, and scenarios in your applications, see the
55+
<a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository</a>.
56+
</para>
5657
<seealso cref="ImageEstimatorsCatalog" />
5758
<seealso cref="ImageLoadingEstimator"/>
5859
</remarks>

0 commit comments

Comments
 (0)