diff --git a/src/Microsoft.ML.Data/Transforms/doc.xml b/src/Microsoft.ML.Data/Transforms/doc.xml index a3d4ba9f5e..13f108a107 100644 --- a/src/Microsoft.ML.Data/Transforms/doc.xml +++ b/src/Microsoft.ML.Data/Transforms/doc.xml @@ -28,7 +28,7 @@ The TextToKeyConverter transform builds up term vocabularies (dictionaries). - The TextToKey Converter and the are the two one primary mechanisms by which raw input is transformed into keys. + The TextToKeyConverter and the are the two primary mechanisms by which raw input is transformed into keys. If multiple columns are used, each column builds/uses exactly one vocabulary. The output columns are KeyType-valued. The Key value is the one-based index of the item in the dictionary. @@ -49,6 +49,52 @@ + + + + Handle missing values by replacing them with either the default value or the indicated value. + + + This transform handles missing values in the input columns. For each input column, it creates an output column + where the missing values are replaced by one of these specified values: + + + The default value of the appropriate type. + + + The mean value of the appropriate type. + + + The max value of the appropriate type. + + + The min value of the appropriate type. + + + The last three work only for numeric/TimeSpan/DateTime kind columns. + + The output column can also optionally include an indicator vector for which slots were missing in the input column. + This can be done only when the indicator vector type can be converted to the input column type, i.e. only for numeric columns. + + + When computing the mean/max/min value, there is also an option to compute it over the whole column instead of per slot. + This option has a default value of true for variable length vectors, and false for known length vectors. + It can be changed to true for known length vectors, but it results in an error if changed to false for variable length vectors. + + + + + + + + + pipeline.Add(new MissingValueHandler("FeatureCol", "CleanFeatureCol") + { + ReplaceWith = NAHandleTransformReplacementKind.Mean + }); + + + diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index 0e5a4c1862..f404f3ae95 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -807,7 +807,7 @@ public static partial class TreeFeaturize Desc = TreeEnsembleFeaturizerTransform.TreeEnsembleSummary, UserName = TreeEnsembleFeaturizerTransform.UserName, ShortName = TreeEnsembleFeaturizerBindableMapper.LoadNameShort, - XmlInclude = new[] { @"" })] + XmlInclude = new[] { @"" })] public static CommonOutputs.TransformOutput Featurizer(IHostEnvironment env, TreeEnsembleFeaturizerTransform.ArgumentsForEntryPoint input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.FastTree/doc.xml b/src/Microsoft.ML.FastTree/doc.xml index 8678654182..26d3c8c129 100644 --- a/src/Microsoft.ML.FastTree/doc.xml +++ b/src/Microsoft.ML.FastTree/doc.xml @@ -95,7 +95,7 @@ Generally, ensemble models provide better coverage and accuracy than single decision trees. Each tree in a decision forest outputs a Gaussian distribution. For more information, see: - + Wikipedia: Random forest Quantile regression forest From Stumps to Trees to Forests @@ -146,7 +146,7 @@ Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: - + A vector containing the individual tree outputs of the tree ensemble.
A vector indicating the leaves that the feature vector falls on in the tree ensemble. A vector indicating the paths that the feature vector falls on in the tree ensemble. @@ -157,28 +157,28 @@ In machine learning, it is a pretty common and powerful approach to utilize an already trained model in the process of defining features. - One such example would be the use of model's scores as features to downstream models. For example, we might run clustering on the original features, + One such example would be the use of a model's scores as features to downstream models. For example, we might run clustering on the original features, and use the cluster distances as the new feature set. - Instead of consuming the model's output, we could go deeper, and extract the 'intermediate outputs' that are used to produce the final score. + Instead of consuming the model's output, we could go deeper, and extract the 'intermediate outputs' that are used to produce the final score. There are a number of famous or popular examples of this technique: - - A deep neural net trained on the ImageNet dataset, with the last layer removed, is commonly used to compute the 'projection' of the image into the 'semantic feature space'. - It is observed that the Euclidean distance in this space often correlates with the 'semantic similarity': that is, all pictures of pizza are located close together, + + A deep neural net trained on the ImageNet dataset, with the last layer removed, is commonly used to compute the 'projection' of the image into the 'semantic feature space'. + It is observed that the Euclidean distance in this space often correlates with the 'semantic similarity': that is, all pictures of pizza are located close together, and far away from pictures of kittens. - A matrix factorization and/or LDA model is also often used to extract the 'latent topics' or 'latent features' associated with users and items. - The weights of the linear model are often used as a crude indicator of 'feature importance'. At the very minimum, the 0-weight features are not needed by the model, - and there's no reason to compute them. + A matrix factorization and/or LDA model is also often used to extract the 'latent topics' or 'latent features' associated with users and items. + The weights of the linear model are often used as a crude indicator of 'feature importance'. At the very minimum, the 0-weight features are not needed by the model, + and there's no reason to compute them. The tree featurizer uses decision tree ensembles for feature engineering in the same fashion as above. - Let's assume that we've built a tree ensemble of 100 trees with 100 leaves each (it doesn't matter whether boosting was used or not in training). + Let's assume that we've built a tree ensemble of 100 trees with 100 leaves each (it doesn't matter whether boosting was used or not in training). If we associate each leaf of each tree with a sequential integer, we can, for every incoming example x, - produce an indicator vector L(x), where Li(x) = 1 if the example x 'falls' into the leaf #i, and 0 otherwise. + produce an indicator vector L(x), where Li(x) = 1 if the example x 'falls' into the leaf #i, and 0 otherwise. Thus, for every example x, we produce a 10000-valued vector L, with exactly 100 1s and the rest zeroes. - This 'leaf indicator' vector can be considered the ensemble-induced 'footprint' of the example.
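As a concrete companion to the L(x) construction just described (the hunk's distance discussion resumes immediately below), here is a minimal C# sketch under the stated 100-tree, 100-leaf assumptions. It is an editorial illustration only: LeafIndicatorSketch and the getLeafIndex delegate are hypothetical stand-ins for however a trained ensemble reports the leaf an example falls into.

using System;

static class LeafIndicatorSketch
{
    // Builds the 10000-valued leaf-indicator vector L(x): one slot per
    // (tree, leaf) pair, with exactly one 1 contributed by each tree.
    public static float[] LeafIndicator(Func<int, float[], int> getLeafIndex,
        float[] example, int treeCount = 100, int leavesPerTree = 100)
    {
        var l = new float[treeCount * leavesPerTree];
        for (int tree = 0; tree < treeCount; tree++)
        {
            int leaf = getLeafIndex(tree, example); // 0-based leaf index within this tree
            l[tree * leavesPerTree + leaf] = 1;     // the example 'falls' into exactly one leaf
        }
        return l;
    }
}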
- The 'distance' between two examples in the L-space is actually a Hamming distance, and is equal to the number of trees that do not distinguish the two examples. + This 'leaf indicator' vector can be considered the ensemble-induced 'footprint' of the example. + The 'distance' between two examples in the L-space is actually a Hamming distance, and is equal to twice the number of trees that distinguish the two examples. We could repeat the same thought process for the non-leaf, or internal, nodes of the trees (we know that each tree has exactly 99 of them in our 100-leaf example), - and produce another indicator vector, N (size 9900), for each example, indicating the 'trajectory' of each example through each of the trees. - The distance in the combined 19900-dimensional LN-space will be equal to the number of 'decisions' in all trees that 'agree' on the given pair of examples. + and produce another indicator vector, N (size 9900), for each example, indicating the 'trajectory' of each example through each of the trees. + The distance in the combined 19900-dimensional LN-space will be equal to the number of 'decisions' in all trees that 'disagree' on the given pair of examples. The TreeLeafFeaturizer also produces a third vector, T, defined as Ti(x) = output of tree #i on example x. diff --git a/src/Microsoft.ML.KMeansClustering/doc.xml b/src/Microsoft.ML.KMeansClustering/doc.xml index a1590595dc..b4318de334 100644 --- a/src/Microsoft.ML.KMeansClustering/doc.xml +++ b/src/Microsoft.ML.KMeansClustering/doc.xml @@ -13,7 +13,7 @@ YYK-Means observes that there is a lot of redundancy across iterations in the KMeans algorithms and most points do not change their clusters during an iteration. It uses various bounding techniques to identify this redundancy and eliminate many distance computations and optimize centroid computations. For more information on K-means and K-means++, see: - + K-means K-means++ diff --git a/src/Microsoft.ML.PCA/doc.xml b/src/Microsoft.ML.PCA/doc.xml index c4f0be7758..5054950c2d 100644 --- a/src/Microsoft.ML.PCA/doc.xml +++ b/src/Microsoft.ML.PCA/doc.xml @@ -11,7 +11,7 @@ Its training is done using the technique described in the paper: Combining Structured and Unstructured Randomness in Large Scale PCA, and the paper Finding Structure with Randomness: Probabilistic Algorithms for Constructing Approximate Matrix Decompositions For more information, see also: - + Randomized Methods for Computing the Singular Value Decomposition (SVD) of very large matrices diff --git a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml index f18bf60990..bdcb973439 100644 --- a/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml +++ b/src/Microsoft.ML.StandardLearners/FactorizationMachine/doc.xml @@ -15,14 +15,14 @@ See references below for more details. This trainer is essentially faster than the one introduced in [2] because of some implementation tricks [3]. - + - [1] Field-aware Factorization Machines for CTR Prediction + Field-aware Factorization Machines for CTR Prediction - [2] Adaptive Subgradient Methods for Online Learning and Stochastic Optimization + Adaptive Subgradient Methods for Online Learning and Stochastic Optimization - [3] An Improved Stochastic Gradient Method for Training Large-scale Field-aware Factorization Machine.
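For readers following the references above, the field-aware interaction that distinguishes an FFM from a plain factorization machine can be written as below. This formula is an editorial addition following the notation of reference [1] (not text from the diff): each feature j learns one latent vector per field, and f_{j} denotes the field of feature j.

% Field-aware interaction term: feature j1 pairs its latent vector for the
% field of j2 with j2's latent vector for the field of j1.
\phi_{\mathrm{FFM}}(\mathbf{w}, \mathbf{x}) =
    \sum_{j_1=1}^{n} \sum_{j_2=j_1+1}^{n}
    \left( \mathbf{w}_{j_1, f_{j_2}} \cdot \mathbf{w}_{j_2, f_{j_1}} \right) x_{j_1} x_{j_2}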
diff --git a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs index 0a73d55395..8c96ee1e0b 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs @@ -123,8 +123,8 @@ public override MultiClassNaiveBayesPredictor Train(TrainContext context) Desc = "Train a MultiClassNaiveBayesTrainer.", UserName = UserName, ShortName = ShortName, - XmlInclude = new[] { @"", - @"" })] + XmlInclude = new[] { @"", + @"" })] public static CommonOutputs.MulticlassClassificationOutput TrainMultiClassNaiveBayesTrainer(IHostEnvironment env, Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml index 0ace721221..2ad6e77aa0 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml +++ b/src/Microsoft.ML.StandardLearners/Standard/Online/doc.xml @@ -13,8 +13,8 @@ and an option to update the weight vector using the average of the vectors seen over time (averaged argument is set to True by default). - - + + new OnlineGradientDescentRegressor { diff --git a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml index ec14c9446b..975f1eb2ff 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml +++ b/src/Microsoft.ML.StandardLearners/Standard/PoissonRegression/doc.xml @@ -12,8 +12,8 @@ Assuming that the dependent variable follows a Poisson distribution, the parameters of the regressor can be estimated by maximizing the likelihood of the obtained observations. - - + + new PoissonRegressor { diff --git a/src/Microsoft.ML.StandardLearners/Standard/doc.xml b/src/Microsoft.ML.StandardLearners/Standard/doc.xml index a704827b88..eb87605232 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/doc.xml +++ b/src/Microsoft.ML.StandardLearners/Standard/doc.xml @@ -22,7 +22,7 @@ In general, the larger the 'L2Const', the faster SDCA converges. For more information, see also: - + Scaling Up Stochastic Dual Coordinate Ascent. 
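The Poisson regression remarks above appeal to maximum likelihood without spelling out the objective. Under the standard log-link formulation (an assumption on our part, since the diff does not state it), the per-example log-likelihood being maximized is:

% With rate lambda = exp(w^T x + b), the Poisson log-likelihood of one
% observation (x, y) is:
\log P(y \mid \mathbf{x}) = y\,(\mathbf{w}^{\top}\mathbf{x} + b)
    - e^{\mathbf{w}^{\top}\mathbf{x} + b} - \log(y!)

The last term does not depend on the parameters, so it can be dropped during optimization.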
diff --git a/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs b/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs index c3ab4ea5e0..5733f84b6b 100644 --- a/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs +++ b/src/Microsoft.ML.Transforms/EntryPoints/SelectFeatures.cs @@ -14,8 +14,8 @@ public static class SelectFeatures [TlcModule.EntryPoint(Name = "Transforms.FeatureSelectorByCount", Desc = CountFeatureSelectionTransform.Summary, UserName = CountFeatureSelectionTransform.UserName, - XmlInclude = new[] { @"", - @""})] + XmlInclude = new[] { @"", + @""})] public static CommonOutputs.TransformOutput CountSelect(IHostEnvironment env, CountFeatureSelectionTransform.Arguments input) { Contracts.CheckValue(env, nameof(env)); @@ -31,8 +31,8 @@ public static CommonOutputs.TransformOutput CountSelect(IHostEnvironment env, Co Desc = MutualInformationFeatureSelectionTransform.Summary, UserName = MutualInformationFeatureSelectionTransform.UserName, ShortName = MutualInformationFeatureSelectionTransform.ShortName, - XmlInclude = new[] { @"", - @""})] + XmlInclude = new[] { @"", + @""})] public static CommonOutputs.TransformOutput MutualInformationSelect(IHostEnvironment env, MutualInformationFeatureSelectionTransform.Arguments input) { Contracts.CheckValue(env, nameof(env)); diff --git a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs index 55330cb6fb..0af833a046 100644 --- a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs @@ -21,7 +21,7 @@ namespace Microsoft.ML.Runtime.Data { - /// + /// public static class MutualInformationFeatureSelectionTransform { public const string Summary = diff --git a/src/Microsoft.ML.Transforms/doc.xml b/src/Microsoft.ML.Transforms/doc.xml index cb6ef6af25..63d2765afc 100644 --- a/src/Microsoft.ML.Transforms/doc.xml +++ b/src/Microsoft.ML.Transforms/doc.xml @@ -7,8 +7,7 @@ Encodes the categorical variable with hash-based encoding. - CategoricalHashOneHotVectorizer converts a categorical value into an indicator array by hashing the - value and using the hash as an index in the bag. + CategoricalHashOneHotVectorizer converts a categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. @@ -33,16 +32,16 @@ The CategoricalOneHotVectorizer transform passes through a data set, operating on text columns, to build a dictionary of categories. For each row, the entire text string appearing in the input column is defined as a category. - The output of this transform is an indicator vector. + The output of this transform is an indicator vector. Each slot in this vector corresponds to a category in the dictionary, so its length is the size of the built dictionary. - The CategoricalOneHotVectorizer can be applied to one or more columns, in which case it builds and uses a separate dictionary + The CategoricalOneHotVectorizer can be applied to one or more columns, in which case it builds and uses a separate dictionary for each column that it is applied to. - The produces integer values and columns. + The transform produces integer values and KeyType columns. The Key value is the one-based index of the slot set in the Ind/Bag options. If the Key option is not found, it is assigned the value zero. - In the , options are not found, they result in an all zero bit vector.
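Stepping back to the CategoricalHashOneHotVectorizer remarks earlier in this hunk, the hash-then-index idea amounts to the following minimal C# sketch. HashOneHotSketch is illustrative only, and string.GetHashCode stands in for the stable hash the real transform uses (GetHashCode is not stable across processes, so it is unsuitable for real features):

static class HashOneHotSketch
{
    // Hashes the text value and uses the hash, modulo the array length,
    // as the index of the single 1 in the indicator array.
    public static float[] HashOneHot(string value, int bits = 16)
    {
        int size = 1 << bits;                                  // length of the indicator array
        var indicator = new float[size];
        int index = (value.GetHashCode() & int.MaxValue) % size; // non-negative bag index
        indicator[index] = 1;
        return indicator;
    }
}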
- and differ simply in how the bit-vectors generated from individual slots are aggregated: + In the Ind and Bag options, values that are not found result in an all zero bit vector. + Ind and Bag differ simply in how the bit-vectors generated from individual slots are aggregated: for Ind they are concatenated and for Bag they are added. When the source column is a singleton, the Ind and Bag options are identical. @@ -117,8 +116,7 @@ Creates a new column with the specified type and default values. - If the user wish to create additional columns with a particular type and default values, - or replicated the values from one column to another, changing their type, they can do so using this transform. + If the user wishes to create additional columns with a particular type and default values, or replicate the values from one column to another, changing their type, they can do so using this transform. This transform can be used as a workaround to create a Label column after deserializing a model, for prediction. Some transforms in the serialized model operate on the Label column, and would throw errors during prediction if such a column is not found. @@ -206,53 +204,7 @@ - - - - Handle missing values by replacing them with either the default value or the indicated value. - - - This transform handles missing values in the input columns. For each input column, it creates an output column - where the missing values are replaced by one of these specified values: - - - The default value of the appropriate type. - - - The mean value of the appropriate type. - - - The max value of the appropriate type. - - - The min value of the appropriate type. - - - The last three work only for numeric/TimeSpan/DateTime kind columns. - - The output column can also optionally include an indicator vector for which slots were missing in the input column. - This can be done only when the indicator vector type can be converted to the input column type, i.e. only for numeric columns. - - - When computing the mean/max/min value, there is also an option to compute it over the whole column instead of per slot. - This option has a default value of true for variable length vectors, and false for known length vectors. - It can be changed to true for known length vectors, but it results in an error if changed to false for variable length vectors. - - - - - - - - - pipeline.Add(new MissingValueHandler("FeatureCol", "CleanFeatureCol") - { - ReplaceWith = NAHandleTransformReplacementKind.Mean - }); - - - - + The LpNormalizer transform normalizes vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). @@ -325,8 +277,8 @@ be ignored, and the missing slots will be 'padded' with default values. - All metadata is preserved for the retained columns. For 'unrolled' columns, all known metadata - except slot names is preserved. + All metadata are preserved for the retained columns. For 'unrolled' columns, all known metadata except slot names are preserved.
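The missing-value block that this diff relocates (added under Microsoft.ML.Data above, removed here) describes per-slot mean replacement. As a rough C# sketch of that strategy, assuming float columns where missing values are encoded as NaN (MeanReplaceSketch is an editorial helper, not the transform's implementation, which also covers min/max, the indicator vector, and other types):

using System;
using System.Linq;

static class MeanReplaceSketch
{
    // Replaces NaN entries with the per-slot mean of the observed values,
    // falling back to the default value (0) for slots that are entirely missing.
    public static float[][] ReplaceWithMean(float[][] rows, int slotCount)
    {
        var means = new float[slotCount];
        for (int slot = 0; slot < slotCount; slot++)
        {
            var observed = rows.Select(r => r[slot]).Where(v => !float.IsNaN(v)).ToArray();
            means[slot] = observed.Length > 0 ? observed.Average() : 0f;
        }
        return rows
            .Select(r => r.Select((v, slot) => float.IsNaN(v) ? means[slot] : v).ToArray())
            .ToArray();
    }
}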
diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index b2181fb256..3ac9a2acfb 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -3080,7 +3080,7 @@ public sealed partial class OneVersusAllMacroSubGraphOutput } - /// + /// public sealed partial class OneVersusAll : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithWeight, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -8601,8 +8601,8 @@ public LogisticRegressionClassifierPipelineStep(Output output) namespace Trainers { - /// - /// + /// + /// public sealed partial class NaiveBayesClassifier : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInputWithLabel, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem { @@ -11417,8 +11417,8 @@ public FeatureCombinerPipelineStep(Output output) namespace Transforms { - /// - /// + /// + /// public sealed partial class FeatureSelectorByCount : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { @@ -11486,8 +11486,8 @@ public FeatureSelectorByCountPipelineStep(Output output) namespace Transforms { - /// - /// + /// + /// public sealed partial class FeatureSelectorByMutualInformation : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { @@ -15329,7 +15329,7 @@ public sealed class Output namespace Transforms { - /// + /// public sealed partial class TreeLeafFeaturizer : Microsoft.ML.Runtime.EntryPoints.CommonInputs.IFeaturizerInput, Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem { diff --git a/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs b/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs index 05688cd2af..3da05f1fbf 100644 --- a/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs +++ b/src/Microsoft.ML/Runtime/EntryPoints/OneVersusAllMacro.cs @@ -136,7 +136,7 @@ private static int GetNumberOfClasses(IHostEnvironment env, Arguments input, out [TlcModule.EntryPoint(Desc = "One-vs-All macro (OVA)", Name = "Models.OneVersusAll", - XmlInclude = new[] { @"" })] + XmlInclude = new[] { @"" })] public static CommonOutputs.MacroOutput OneVersusAll( IHostEnvironment env, Arguments input,
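The Models.OneVersusAll entry point registered above follows the textbook one-vs-all recipe: train one binary classifier per class against the rest, then predict the class whose scorer responds most strongly. A hedged C# sketch of that recipe (the trainBinary delegate is a placeholder, not the macro's actual subgraph plumbing):

using System;
using System.Linq;

static class OneVersusAllSketch
{
    // Trains one binary scorer per class (class k vs. the rest) and returns
    // a predictor that picks the class with the highest score.
    public static Func<float[], int> Train(
        float[][] features, int[] labels, int classCount,
        Func<float[][], bool[], Func<float[], float>> trainBinary)
    {
        var scorers = new Func<float[], float>[classCount];
        for (int k = 0; k < classCount; k++)
        {
            int cls = k; // local copy, for clarity about what each scorer targets
            bool[] binaryLabels = labels.Select(y => y == cls).ToArray();
            scorers[k] = trainBinary(features, binaryLabels);
        }
        return x => Enumerable.Range(0, classCount)
                              .OrderByDescending(k => scorers[k](x))
                              .First();
    }
}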