dotnet · sfilipi · Apr 19, 2019 · Apr 19, 2019 · Apr 19, 2019 · shmoradims
diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs
@@ -18,7 +18,47 @@
 
 namespace Microsoft.ML.Transforms
 {
-    /// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]' />
+    /// <summary>
+    ///  Selects the slots for which the count of non-default values is greater than or equal to a threshold.
+    /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    ///
+    /// ###  Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes. |
+    /// | Input column data type | Vector or primitive of numeric, text or [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
+    /// | Output column data type | Same as the input column.|
+    ///
+    /// This transform uses a set of aggregators to count the number of non-default values for each slot (vector element) and
+    /// instantiates a <xref:Microsoft.ML.Transforms.SlotsDroppingTransformer> to drop every slot whose count of non-default values
+    /// is below the specified threshold, 'count'; slots with at least 'count' non-default values are kept.
+    /// This transform is useful when applied together with a <xref:Microsoft.ML.Transforms.OneHotHashEncodingTransformer>.
+    /// The count feature selection can remove the features generated by the hash transform that have no data in the examples.
+    ///
+    /// For example, if we set the count to 3, fit the estimator, and apply the resulting transformer to the following Features column,
+    /// the second slot, containing the values NaN, 5, 5, NaN, is dropped because it has only two non-default values (the two 5 values).
+    /// The third slot is kept because it contains the values 6, 6, 6, NaN, so it has at least 3 non-NaN values.
+    ///
+    /// |  Features |
+    /// | -- |
+    /// |4,NaN,6  |
+    /// |4,5,6 |
+    /// |4,5,6 |
+    /// |4,NaN,NaN|
+    ///
+    /// This is how the dataset above would look, after the transformation.
+    ///
+    /// |  Features |
+    /// | -- |
+    /// |4,6  |
+    /// |4,6 |
+    /// |4,6 |
+    /// |4,NaN|
+    ///
+    /// ]]></format>
+    /// </remarks>
     public sealed class CountFeatureSelectingEstimator : IEstimator<ITransformer>
     {
         internal const string Summary = "Selects the slots for which the count of non-default values is greater than or equal to a threshold.";

diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs
@@ -14,7 +14,9 @@ namespace Microsoft.ML
 
     public static class FeatureSelectionCatalog
     {
-        /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
+        /// <summary>
+        /// Create a <see cref="MutualInformationFeatureSelectingEstimator"/>, which selects the top k slots across all specified columns ordered by their mutual information with the label column.
+        /// </summary>
         /// <param name="catalog">The transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
@@ -35,7 +37,9 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu
             int numberOfBins = MutualInfoSelectDefaults.NumBins)
             => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins);
 
-        /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
+        /// <summary>
+        /// Create a <see cref="MutualInformationFeatureSelectingEstimator"/>, which selects the top k slots across all specified columns ordered by their mutual information with the label column.
+        /// </summary>
         /// <param name="catalog">The transform's catalog.</param>
         /// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param>
         /// <param name="labelColumnName">The name of the label column.</param>
@@ -60,15 +64,19 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu
                 columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray());
         }
 
-        /// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]' />
+        /// <summary>
+        /// Create a <see cref="CountFeatureSelectingEstimator"/>, which selects the slots for which the count of non-default values is greater than or equal to a threshold.
+        /// </summary>
         /// <param name="catalog">The transform's catalog.</param>
         /// <param name="columns">Describes the parameters of the feature selection process for each column pair.</param>
         [BestFriend]
         internal static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this TransformsCatalog.FeatureSelectionTransforms catalog,
             params CountFeatureSelectingEstimator.ColumnOptions[] columns)
             => new CountFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), columns);
 
-        /// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]' />
+        /// <summary>
+        /// Create a <see cref="CountFeatureSelectingEstimator"/>, which selects the slots for which the count of non-default values is greater than or equal to a threshold.
+        /// </summary>
         /// <param name="catalog">The transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
@@ -86,7 +94,9 @@ public static CountFeatureSelectingEstimator SelectFeaturesBasedOnCount(this Tra
             long count = CountSelectDefaults.Count)
             => new CountFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, count);
 
-        /// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]' />
+        /// <summary>
+        /// Create a <see cref="CountFeatureSelectingEstimator"/>, which selects the slots for which the count of non-default values is greater than or equal to a threshold.
+        /// </summary>
         /// <param name="catalog">The transform's catalog.</param>
         /// <param name="columns">Specifies the names of the columns on which to apply the transformation.</param>
         /// <param name="count">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param>

diff --git a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs
@@ -19,7 +19,54 @@
 
 namespace Microsoft.ML.Transforms
 {
-    /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
+    /// <summary>
+    /// Selects the top k slots across all specified columns ordered by their mutual information with the label column
+    /// (what you can learn about the label by observing the value of the specified column).
+    /// </summary>
+    /// <remarks>
+    /// <format type="text/markdown"><![CDATA[
+    ///
+    /// ###  Estimator Characteristics
+    /// |  |  |
+    /// | -- | -- |
+    /// | Does this estimator need to look at the data to train its parameters? | Yes. |
+    /// | Input column data type | Vector or primitive of numeric, text or [key](xref:Microsoft.ML.Data.KeyDataViewType) data types.|
+    /// | Output column data type | Same as the input column.|
+    ///
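+    /// The mutual information of two random variables X and Y is a measure of the mutual dependence between the variables. Formally, the mutual information can be written as: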
+    ///
+    /// MI(X,Y) = E[log(P(x,y)) - log(P(x)) - log(P(y))]
+    ///
+    /// where the expectation E is taken over the joint distribution of X and Y.
+    /// Here P(x, y) is the joint probability density function of X and Y, P(x) and P(y) are the marginal probability density functions of X and Y respectively.
+    /// In general, a higher mutual information between the dependent variable (or label) and an independent variable (or feature) means
+    /// that the label has higher mutual dependence over that feature.
+    /// The estimator keeps the top `slotsInOutput` features with the largest mutual information with the label.
+    ///
+    /// For example, for the following Features and Label columns, if we specify that we want the top 2 slots (vector elements) that have the highest correlation
+    /// with the label column, the output of applying this estimator would keep only the first and the third slots, because their values
+    /// are more correlated with the values in the Label column.
+    ///
+    /// | Label |  Features |
+    /// | -- | -- |
+    /// |True |4,6,0 |
+    /// |False|0,7,5 |
+    /// |True |4,7,0 |
+    /// |False|0,7,0 |
+    ///
+    /// This is how the dataset above would look, after fitting the estimator, and transforming the data with the resulting transformer:
+    ///
+    /// | Label |  Features |
+    /// | -- | -- |
+    /// |True |4,0 |
+    /// |False|0,5 |
+    /// |True |4,0 |
+    /// |False|0,0 |
+    ///
+    /// ]]></format>
+    /// </remarks>
+    /// <seealso cref="FeatureSelectionCatalog.SelectFeaturesBasedOnMutualInformation(TransformsCatalog.FeatureSelectionTransforms, InputOutputColumnPair[], string, int, int)"/>
+    /// <seealso cref="FeatureSelectionCatalog.SelectFeaturesBasedOnMutualInformation(TransformsCatalog.FeatureSelectionTransforms, string, string, string, int, int)"/>
     public sealed class MutualInformationFeatureSelectingEstimator : IEstimator<ITransformer>
     {
         internal const string Summary =

diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml
@@ -57,61 +57,6 @@
       </example>
     </example>
 
-    <member name="CountFeatureSelection">
-      <summary>
-        Selects the slots for which the count of non-default values is greater than or equal to a threshold.
-      </summary>
-      <remarks>
-        <para>
-          This transform uses a set of aggregators to count the number of non-default values for each slot and
-          instantiates a <see cref="T:Microsoft.ML.Transforms.SlotsDroppingTransformer"/> to actually drop the slots.
-          This transform is useful when applied together with a <see cref="T:Microsoft.ML.Transforms.OneHotHashEncodingTransformer"/>. 
-          The count feature selection can remove those features generated by the hash transform that have no data in the examples.
-        </para>
-      </remarks>
-    </member>
-    <example name="CountFeatureSelection">
-       <example>
-        <code language="csharp">
-          pipeline.Add(new FeatureSelectorByCount
-          { 
-            Column = new[]{ &quot;Feature1&quot; }, 
-            Count = 2 
-          });
-        </code>
-      </example>
-    </example>
-
-    <member name="MutualInformationFeatureSelection">
-      <summary>
-        Selects the top k slots across all specified columns ordered by their mutual information with the label column.
-      </summary>
-      <remarks>
-        <para>
-          The mutual information of two random variables X and Y is a measure of the mutual dependence between the variables.
-          Formally, the mutual information can be written as:
-        </para>
-        <para>I(X;Y) = E[log(p(x,y)) - log(p(x)) - log(p(y))]</para>
-        <para>where the expectation is taken over the joint distribution of X and Y. 
-        Here p(x,y) is the joint probability density function of X and Y, p(x) and p(y) are the marginal probability density functions of X and Y respectively. 
-        In general, a higher mutual information between the dependent variable (or label) and an independent variable (or feature) means 
-        that the label has higher mutual dependence over that feature.
-        It keeps the top SlotsInOutput features with the largest mutual information with the label.
-        </para>
-      </remarks>
-    </member>
-    <example name="MutualInformationFeatureSelection">
-      <example>
-        <code language="csharp">
-          pipeline.Add(new FeatureSelectorByMutualInformation
-          { 
-            Column = new[]{ &quot;Feature1&quot; }, 
-            SlotsInOutput = 6 
-           });
-        </code>
-      </example>
-    </example>
-
     <member name="OptionalColumnTransform">
       <summary>
         Creates a new column with the specified type and default values.