Data catalog done (#3021)

sfilipi · web-flow · commit c8a4c7dec32a · 2019-03-20T10:21:11.000-07:00
* adding XML documentation to a public API that had no documentation.

* adding a TrainTestSplit sample. Small corrections to the images doc.xml.
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs
@@ -3,7 +3,7 @@
 
 namespace Microsoft.ML.Samples.Dynamic
 {
-    public static class Bootstrap
+    public static class BootstrapSample
     {
         public static void Example()
         {
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/TrainTestSplit.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/TrainTestSplit.cs
@@ -0,0 +1,107 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Collections.Immutable;
+using System.Linq;
+using Microsoft.ML.Data;
+using static Microsoft.ML.DataOperationsCatalog;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    /// <summary>
+    /// Sample showing how to split an IDataView into train and test sets with TrainTestSplit, both with and without a sampling key column.
+    /// </summary>
+    public static class TrainTestSplit
+    {
+        public static void Example()
+        {
+            // Create the MLContext; its Data catalog is used below to load and split the data.
+            var mlContext = new MLContext();
+
+            // Generate 10 data points (the generator's default seed of 0 is used on every run).
+            var examples = GenerateRandomDataPoints(10);
+
+            // Convert the examples list to an IDataView object, which is consumable by ML.NET API.
+            var dataview = mlContext.Data.LoadFromEnumerable(examples);
+
+            // Leave out 10% of the dataset for testing. For some types of problems, for example for ranking or anomaly detection,
+            // we must ensure that the split leaves the rows with the same value in a particular column, in one of the splits.
+            // So below, we specify Group column as the column containing the sampling keys.
+            // Notice how keeping the rows with the same value in the Group column overrides the testFraction definition.
+            TrainTestData split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.1, samplingKeyColumnName: "Group");
+
+            PrintPreviewRows(split);
+
+            //  The data in the Train split.
+            //  [Group, 1], [Features, 0.8173254]
+            //  [Group, 1], [Features, 0.5581612]
+            //  [Group, 1], [Features, 0.5588848]
+            //  [Group, 1], [Features, 0.4421779]
+            //  [Group, 1], [Features, 0.2737045]
+
+            //  The data in the Test split.
+            //  [Group, 0], [Features, 0.7262433]
+            //  [Group, 0], [Features, 0.7680227]
+            //  [Group, 0], [Features, 0.2060332]
+            //  [Group, 0], [Features, 0.9060271]
+            //  [Group, 0], [Features, 0.9775497]
+
+            // Example of a split without specifying a sampling key column, this time requesting a 20% test fraction.
+            split = mlContext.Data.TrainTestSplit(dataview, testFraction: 0.2);
+            PrintPreviewRows(split);
+
+            // The data in the Train split.
+            // [Group, 0], [Features, 0.7262433]
+            // [Group, 1], [Features, 0.8173254]
+            // [Group, 0], [Features, 0.7680227]
+            // [Group, 1], [Features, 0.5581612]
+            // [Group, 0], [Features, 0.2060332]
+            // [Group, 1], [Features, 0.4421779]
+            // [Group, 0], [Features, 0.9775497]
+            // [Group, 1], [Features, 0.2737045]
+
+            // The data in the Test split.
+            // [Group, 1], [Features, 0.5588848]
+            // [Group, 0], [Features, 0.9060271]
+
+        }
+
+        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
+        {
+            var random = new Random(seed);
+            for (int i = 0; i < count; i++)
+            {
+                yield return new DataPoint
+                {
+                    Group = i % 2,
+
+                    // A random value drawn from the seeded generator, narrowed to float.
+                    Features = (float)random.NextDouble()
+                };
+            }
+        }
+
+        // A single example with a Group column (used above as the sampling key) and a Features column. A data set is a collection of such examples.
+        private class DataPoint
+        {
+            public float Group { get; set; }
+
+            public float Features { get; set; }
+        }
+
+        // Print helper: writes the first two column values of each row returned by Preview(), for the train split and then the test split.
+        private static void PrintPreviewRows(TrainTestData split)
+        {
+
+            var trainDataPreview = split.TrainSet.Preview();
+            var testDataPreview = split.TestSet.Preview();
+
+            Console.WriteLine($"The data in the Train split.");
+            foreach (var row in trainDataPreview.RowView)
+                Console.WriteLine($"{row.Values[0]}, {row.Values[1]}");
+
+            Console.WriteLine($"\nThe data in the Test split.");
+            foreach (var row in testDataPreview.RowView)
+                Console.WriteLine($"{row.Values[0]}, {row.Values[1]}");
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
@@ -70,7 +70,7 @@ internal DataOperationsCatalog(IHostEnvironment env)
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[BootstrapSample](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
+        /// [!code-csharp[LoadFromEnumerable](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
         /// ]]>
         /// </format>
         /// </example>
@@ -82,6 +82,25 @@ public IDataView LoadFromEnumerable<TRow>(IEnumerable<TRow> data, SchemaDefiniti
             return DataViewConstructionUtils.CreateFromEnumerable(_env, data, schemaDefinition);
         }
 
+        /// <summary>
+        /// Create a new <see cref="IDataView"/> over an enumerable of the items of user-defined type, and the provided <see cref="DataViewSchema"/>
+        /// which might contain more information about the schema than the type can capture.
+        /// </summary>
+        /// <remarks>
+        /// The user maintains ownership of the <paramref name="data"/> and the resulting data view will
+        /// never alter the contents of the <paramref name="data"/>.
+        /// Since <see cref="IDataView"/> is assumed to be immutable, the user is expected to support
+        /// multiple enumeration of the <paramref name="data"/> that would return the same results, unless
+        /// the user knows that the data will only be cursored once.
+        /// One typical usage for streaming data view could be: create the data view that lazily loads data
+        /// as needed, then apply pre-trained transformations to it and cursor through it for transformation
+        /// results.
+        /// One practical usage of this would be to supply the feature column names through the <see cref="DataViewSchema.Annotations"/>.
+        /// </remarks>
+        /// <typeparam name="TRow">The <typeparamref name="TRow"/> to convert to an <see cref="IDataView"/>.</typeparam>
+        /// <param name="data">The data with <typeparamref name="TRow"/> to convert to an <see cref="IDataView"/>.</param>
+        /// <param name="schema">The schema of the returned <see cref="IDataView"/>.</param>
+        /// <returns>An <see cref="IDataView"/> with the given <paramref name="schema"/>.</returns>
         public IDataView LoadFromEnumerable<TRow>(IEnumerable<TRow> data, DataViewSchema schema)
             where TRow : class
         {
@@ -102,7 +121,7 @@ public IDataView LoadFromEnumerable<TRow>(IEnumerable<TRow> data, DataViewSchema
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[BootstrapSample](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
+        /// [!code-csharp[CreateEnumerable](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/DataViewEnumerable.cs)]
         /// ]]>
         /// </format>
         /// </example>
@@ -381,6 +400,13 @@ public IDataView TakeRows(IDataView input, long count)
         /// they are guaranteed to appear in the same subset (train or test). This can be used to ensure no label leakage from the train to the test set.
         /// If <see langword="null"/> no row grouping will be performed.</param>
         /// <param name="seed">Seed for the random number generator used to select rows for the train-test split.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[TrainTestSplit](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/TrainTestSplit.cs)]
+        /// ]]>
+        /// </format>
+        /// </example>
         public TrainTestData TrainTestSplit(IDataView data, double testFraction = 0.1, string samplingKeyColumnName = null, int? seed = null)
         {
             _env.CheckValue(data, nameof(data));
diff --git a/src/Microsoft.ML.ImageAnalytics/doc.xml b/src/Microsoft.ML.ImageAnalytics/doc.xml
@@ -8,16 +8,15 @@
       </summary>
       <remarks>
         <format type="text/markdown">
-            <![CDATA[ 
-
-          ## Remarks 
-          The images might be converted to grayscale to reduce the complexity of the model.
-          The grayed out images contain less information to process than the colored images.
-          Another use case for converting to grayscale is to generate new images out of the existing ones, so you can have a larger dataset,        
-          a technique known as [data augmentation](http://www.stat.harvard.edu/Faculty_Content/meng/JCGS01.pdf)</a>.
-          
-          For end-to-end image processing pipelines, and scenarios in your applications, see the 
-          [examples in the machinelearning-samples github repository](https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started).</a>
+         <![CDATA[
+## Remarks 
+The images might be converted to grayscale to reduce the complexity of the model.
+The grayed out images contain less information to process than the colored images.
+Another use case for converting to grayscale is to generate new images out of the existing ones, so you can have a larger dataset,        
+a technique known as [data augmentation](http://www.stat.harvard.edu/Faculty_Content/meng/JCGS01.pdf).
+         
+For end-to-end image processing pipelines, and scenarios in your applications, see the 
+[examples in the machinelearning-samples github repository](https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started).
         ]]></format>
         <seealso cref="ImageEstimatorsCatalog" />
         <seealso cref="ImageLoadingEstimator"/>
@@ -31,10 +30,11 @@
       <remarks>
         The ImagePixelExtractingEstimator extracts the pixels from the input images and, converts them into a vector of numbers.
         This can be further used as feature by the algorithms added to the pipeline.
-
-        ImagePixelExtractingEstimator expects a <see cref="ImageResizingEstimator"/>  in the pipeline, before it is used.
-        For end-to-end image processing pipelines, and scenarios in your applications, see the
-        <a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository</a>.
+        <para>
+          ImagePixelExtractingEstimator expects a <see cref="ImageResizingEstimator"/>  in the pipeline, before it is used.
+          For end-to-end image processing pipelines, and scenarios in your applications, see the
+          <a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository</a>.
+        </para>
         <seealso cref="ImageEstimatorsCatalog" />
         <seealso cref="ImageLoadingEstimator"/>
         <seealso cref="ImageResizingEstimator"/>
@@ -50,9 +50,10 @@
         extract features for usage in the machine learning algorithms.
         Those pre-trained models have a defined width and height for their input images, so often, after getting loaded, the images will need to get resized before
         further processing.
-
-        For end-to-end image processing pipelines, and scenarios in your applications, see the
-        <a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository.</a>
+        <para>
+          For end-to-end image processing pipelines, and scenarios in your applications, see the
+          <a href="https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started">examples in the machinelearning-samples github repository.</a>
+        </para>
         <seealso cref="ImageEstimatorsCatalog" />
         <seealso cref="ImageLoadingEstimator"/>
       </remarks>

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`
`4`	`4`	`namespace Microsoft.ML.Samples.Dynamic`
`5`	`5`	`{`
`6`		`- public static class Bootstrap`
	`6`	`+ public static class BootstrapSample`
`7`	`7`	`{`
`8`	`8`	`public static void Example()`
`9`	`9`	`{`