diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs
new file mode 100644
index 0000000000..b1a134b192
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs
@@ -0,0 +1,254 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML;
+using Microsoft.ML.Data;
+
+namespace Samples.Dynamic
+{
+ /// <summary>
+ /// The <see cref="IDataView"/> interface is the central concept of "data" in ML.NET. While many conveniences exist
+ /// to create pre-baked implementations, it is also useful to know how to create one completely from scratch. We also
+ /// take this opportunity to illustrate and motivate the basic principles of how the IDataView system is architected,
+ /// since people interested in implementing <see cref="IDataView"/> need at least some knowledge of those principles.
+ /// </summary>
+ public static class SimpleDataViewImplementation
+ {
+ /// <summary>
+ /// Wraps a small in-memory array in an <see cref="InputObjectDataView"/>, tokenizes its "Text" column with a
+ /// one-step ML.NET pipeline, and prints each original sentence next to its tokens.
+ /// </summary>
+ public static void Example()
+ {
+     // First we create an array of these objects, which we "present" as this IDataView implementation so that
+     // it can be used in a simple ML.NET pipeline.
+     var inputArray = new[]
+     {
+         new InputObject(false, "Hello my friend."),
+         new InputObject(true, "Stay awhile and listen."),
+         new InputObject(true, "Masterfully done hero!")
+     };
+     var dataView = new InputObjectDataView(inputArray);
+
+     // This is a very simple pipeline: a single transformer that tokenizes Text, and does nothing with the
+     // Label column at all.
+     var mlContext = new MLContext();
+     var transformedDataView = mlContext.Transforms.Text.TokenizeIntoWords(
+         "TokenizedText", "Text").Fit(dataView).Transform(dataView);
+
+     var textColumn = transformedDataView.Schema["Text"];
+     var tokensColumn = transformedDataView.Schema["TokenizedText"];
+
+     using (var cursor = transformedDataView.GetRowCursor(new[] { textColumn, tokensColumn }))
+     {
+         // Note that it is best to get the getters and values *before* iteration, so as to facilitate buffer
+         // sharing (if applicable), and column-type validation once, rather than many times.
+         ReadOnlyMemory<char> textValue = default;
+         VBuffer<ReadOnlyMemory<char>> tokensValue = default;
+
+         var textGetter = cursor.GetGetter<ReadOnlyMemory<char>>(textColumn);
+         var tokensGetter = cursor.GetGetter<VBuffer<ReadOnlyMemory<char>>>(tokensColumn);
+
+         while (cursor.MoveNext())
+         {
+             // Each getter overwrites the value passed by ref with the current row's content.
+             textGetter(ref textValue);
+             tokensGetter(ref tokensValue);
+
+             Console.WriteLine($"{textValue} => {string.Join(", ", tokensValue.DenseValues())}");
+         }
+
+         // The output to console is this:
+
+         // Hello my friend. => Hello, my, friend.
+         // Stay awhile and listen. => Stay, awhile, and, listen.
+         // Masterfully done hero! => Masterfully, done, hero!
+
+         // Note that it may be interesting to set a breakpoint on the Console.WriteLine, and explore
+         // what is going on with the cursor, and the buffers. In particular, on the third iteration,
+         // while `tokensValue` is logically presented as a three element array, internally you will
+         // see that the arrays internal to that structure have (at least) four items, specifically:
+         // `Masterfully`, `done`, `hero!`, `listen.`. In this way we see a simple example of the details
+         // of how buffer sharing from one iteration to the next actually works.
+     }
+ }
+
+ /// <summary>
+ /// Immutable row type for the sample: one labeled sentence.
+ /// </summary>
+ private sealed class InputObject
+ {
+     /// <summary>The boolean label supplied at construction.</summary>
+     public bool Label { get; }
+
+     /// <summary>The sentence supplied at construction.</summary>
+     public string Text { get; }
+
+     /// <summary>Creates a row holding the given label and text.</summary>
+     public InputObject(bool label, string text)
+         => (Label, Text) = (label, text);
+ }
+
+ /// <summary>
+ /// This is an implementation of <see cref="IDataView"/> that wraps an <see cref="IEnumerable{T}"/>
+ /// of the above <see cref="InputObject"/>. Note that normally under these circumstances, the first
+ /// recommendation would be to use a convenience like
+ /// <see cref="DataOperationsCatalog.LoadFromEnumerable{TRow}(IEnumerable{TRow}, SchemaDefinition)"/>
+ /// or something like that, rather than implementing <see cref="IDataView"/> outright. However, in some
+ /// situations where code generation is impossible, such as Unity or other similar platforms, implementing
+ /// something closely resembling this may become necessary.
+ ///
+ /// This implementation of <see cref="IDataView"/>, being didactic, is much simpler than practically
+ /// anything one would find in the ML.NET codebase. In this case we have a completely fixed schema (the two
+ /// fields of <see cref="InputObject"/>), with fixed types.
+ ///
+ /// For <see cref="Schema"/>, note that we keep a very simple schema based off the members of the object. You
+ /// may in fact note that in this specific case it would be possible for this implementation of
+ /// <see cref="IDataView"/> to share the same <see cref="DataViewSchema"/> object across all instances of this
+ /// object, but since this is almost never the case, I do not take advantage of that.
+ ///
+ /// We have chosen to wrap an <see cref="IEnumerable{T}"/>, so in fact only a very simple implementation is
+ /// possible. Specifically: we cannot meaningfully shuffle (so <see cref="CanShuffle"/> is
+ /// <see langword="false"/>, and even if a <see cref="Random"/> parameter were passed to
+ /// <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>, we could not make use of it), we do
+ /// not know the count of the items right away without counting (so, it is most correct for
+ /// <see cref="GetRowCount"/> to return <see langword="null"/>, even after we might hypothetically know after
+ /// the first pass, given the immutability principle of <see cref="IDataView"/>), and the
+ /// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/> method returns a single item.
+ ///
+ /// The <see cref="Cursor"/> derived class has more documentation specific to its behavior.
+ ///
+ /// Note that this implementation, as well as the nested <see cref="Cursor"/> derived class, does
+ /// far less validation of parameters and guarding against misuse than we would like from, say, implementations
+ /// of the same classes within the ML.NET codebase.
+ /// </summary>
+ private sealed class InputObjectDataView : IDataView
+ {
+ // The wrapped rows. Every cursor created from this view starts its own enumeration over this sequence.
+ private readonly IEnumerable<InputObject> _data;
+
+ /// <summary>The fixed two-column schema ("Label", then "Text") built once in the constructor.</summary>
+ public DataViewSchema Schema { get; }
+
+ /// <summary>Always <see langword="false"/>; this view does not support shuffled cursors.</summary>
+ public bool CanShuffle => false;
+
+ /// <summary>
+ /// Creates a view over <paramref name="data"/> and builds its schema.
+ /// </summary>
+ /// <param name="data">The rows to expose. The reference is stored as-is; no copy is made.</param>
+ public InputObjectDataView(IEnumerable<InputObject> data)
+ {
+     _data = data;
+
+     // Column order matters: the cursor addresses "Label" as index 0 and "Text" as index 1.
+     var builder = new DataViewSchema.Builder();
+     builder.AddColumn("Label", BooleanDataViewType.Instance);
+     builder.AddColumn("Text", TextDataViewType.Instance);
+     Schema = builder.ToSchema();
+ }
+
+ /// <summary>
+ /// Reports the number of rows in this view, which is never known up front.
+ /// </summary>
+ /// <returns>Always <see langword="null"/>.</returns>
+ public long? GetRowCount()
+ {
+     return null;
+ }
+
+ /// <summary>
+ /// Creates a cursor over the wrapped rows in which only the requested columns are active.
+ /// </summary>
+ /// <param name="columnsNeeded">
+ /// Columns the caller intends to read. They are matched by index: 0 is "Label" and 1 is "Text".
+ /// </param>
+ /// <param name="rand">Ignored; this view cannot shuffle.</param>
+ /// <returns>A new cursor positioned before the first row.</returns>
+ public DataViewRowCursor GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand = null)
+ {
+     // Walk the request once rather than once per column, so a lazily produced sequence is not re-evaluated.
+     bool wantsLabel = false;
+     bool wantsText = false;
+     foreach (var column in columnsNeeded)
+     {
+         wantsLabel |= column.Index == 0;
+         wantsText |= column.Index == 1;
+     }
+     return new Cursor(this, wantsLabel, wantsText);
+ }
+
+ /// <summary>
+ /// Returns a cursor set containing exactly one cursor, obtained from <c>GetRowCursor</c>.
+ /// </summary>
+ /// <param name="columnsNeeded">Columns the caller intends to read; forwarded unchanged.</param>
+ /// <param name="n">Suggested number of cursors. Ignored; the result always has length one.</param>
+ /// <param name="rand">Forwarded to <c>GetRowCursor</c>.</param>
+ /// <returns>A single-element array holding the new cursor.</returns>
+ public DataViewRowCursor[] GetRowCursorSet(IEnumerable<DataViewSchema.Column> columnsNeeded, int n, Random rand = null)
+     => new[] { GetRowCursor(columnsNeeded, rand) };
+
+ /// <summary>
+ /// Having this be a private sealed nested class follows the typical pattern: in most
+ /// <see cref="IDataView"/> implementations, the cursor instance is almost always that. The only "common"
+ /// exceptions to this tendency are those implementations that are such thin wrappers over an existing
+ /// <see cref="IDataView"/> that they do not even bother to change the schema.
+ ///
+ /// On the subject of schema, note that there is an expectation that the <see cref="Schema"/> object is
+ /// reference equal to the <see cref="IDataView.Schema"/> object of the data view that created this cursor,
+ /// as we see here.
+ ///
+ /// Note that <see cref="Batch"/> returns <c>0</c>. As described in the documentation of that property, that
+ /// is meant to facilitate the reconciliation of the partitioning of the data in the case where multiple
+ /// cursors are returned from
+ /// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/>, but since only one is
+ /// ever returned from the implementation, this behavior is appropriate.
+ ///
+ /// Similarly, since it is impossible to have a shuffled cursor or a cursor set, it is sufficient for the
+ /// <see cref="GetIdGetter"/> implementation to return a simple ID based on the position. If, however, this
+ /// had been something built on, hypothetically, an <see cref="IList{T}"/> or some other such structure, and
+ /// shuffling and partitioning were available, an ID based on the index of whatever item was being returned
+ /// would be appropriate.
+ ///
+ /// Note the usage of the <see langword="ref"/> parameters on the <see cref="ValueGetter{TValue}"/>
+ /// implementations. This is most valuable in the case of buffer sharing for <see cref="VBuffer{T}"/>, but
+ /// we still of course have to deal with it here.
+ ///
+ /// Note also that we spend a considerable amount of effort to make the
+ /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/> and
+ /// <see cref="IsColumnActive(DataViewSchema.Column)"/> methods correctly reflect what was asked for from
+ /// the <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>
+ /// method that was used to create this cursor. In this particular case, the point is somewhat moot: this
+ /// mechanism exists to enable lazy evaluation, but since this cursor is implemented to wrap an
+ /// <see cref="IEnumerator{T}"/> which has no concept of lazy evaluation, there is no real practical benefit
+ /// to doing this. However, it is best of course to illustrate the general principle for the sake of the
+ /// example.
+ ///
+ /// Even in this simple form, we see the reason why <see cref="GetGetter{TValue}(DataViewSchema.Column)"/>
+ /// is beneficial: the <see cref="ValueGetter{TValue}"/> implementations themselves are simple to the point
+ /// where their operation is dwarfed by the simple acts of casting and validation checking one sees in
+ /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/>. In this way we only pay the cost of validation
+ /// and casting once, not every time we get a value.
+ /// </summary>
+ private sealed class Cursor : DataViewRowCursor
+ {
+     private bool _disposed;
+     // Zero-based index of the current row; -1 before the first successful MoveNext, and reset to -1 on disposal.
+     private long _position;
+     private readonly IEnumerator<InputObject> _enumerator;
+     // Indexed by schema column index; an entry is null when that column was not requested.
+     private readonly Delegate[] _getters;
+
+     public override long Position => _position;
+     public override long Batch => 0;
+     public override DataViewSchema Schema { get; }
+
+     /// <summary>
+     /// Starts a new enumeration over the rows of <paramref name="parent"/>, reusing its schema instance.
+     /// </summary>
+     /// <param name="parent">The data view that is creating this cursor.</param>
+     /// <param name="wantsLabel">Whether the "Label" column (index 0) should be active.</param>
+     /// <param name="wantsText">Whether the "Text" column (index 1) should be active.</param>
+     public Cursor(InputObjectDataView parent, bool wantsLabel, bool wantsText)
+     {
+         Schema = parent.Schema;
+         _position = -1;
+         _enumerator = parent._data.GetEnumerator();
+         // Store each getter as its strongly typed delegate so that GetGetter only has to cast it back.
+         _getters = new Delegate[]
+         {
+             wantsLabel ? (ValueGetter<bool>)LabelGetterImplementation : null,
+             wantsText ? (ValueGetter<ReadOnlyMemory<char>>)TextGetterImplementation : null
+         };
+     }
+
+     /// <summary>
+     /// Releases the underlying enumerator. Calling this more than once has no further effect.
+     /// </summary>
+     protected override void Dispose(bool disposing)
+     {
+         if (_disposed)
+             return;
+         if (disposing)
+         {
+             _enumerator.Dispose();
+             _position = -1;
+         }
+         _disposed = true;
+         base.Dispose(disposing);
+     }
+
+     // Writes the current row's label into the caller-supplied slot.
+     private void LabelGetterImplementation(ref bool value)
+         => value = _enumerator.Current.Label;
+
+     // Writes the current row's text into the caller-supplied slot.
+     private void TextGetterImplementation(ref ReadOnlyMemory<char> value)
+         => value = _enumerator.Current.Text.AsMemory();
+
+     // The row ID is simply the position, since this cursor is never shuffled or split.
+     private void IdGetterImplementation(ref DataViewRowId id)
+         => id = new DataViewRowId((ulong)_position, 0);
+
+     /// <summary>
+     /// Returns the getter for <paramref name="column"/>, cast to the requested delegate type.
+     /// </summary>
+     /// <typeparam name="TValue">The value type of the column; a mismatch makes the cast fail.</typeparam>
+     /// <exception cref="ArgumentOutOfRangeException">The column was not requested for this cursor.</exception>
+     public override ValueGetter<TValue> GetGetter<TValue>(DataViewSchema.Column column)
+     {
+         if (!IsColumnActive(column))
+             throw new ArgumentOutOfRangeException(nameof(column));
+         return (ValueGetter<TValue>)_getters[column.Index];
+     }
+
+     /// <summary>Returns a getter that produces an ID derived from the current position.</summary>
+     public override ValueGetter<DataViewRowId> GetIdGetter()
+         => IdGetterImplementation;
+
+     /// <summary>Returns whether a getter was created for <paramref name="column"/>.</summary>
+     public override bool IsColumnActive(DataViewSchema.Column column)
+         => _getters[column.Index] != null;
+
+     /// <summary>
+     /// Advances to the next row. Once the underlying enumerator is exhausted the cursor disposes itself.
+     /// </summary>
+     /// <returns>
+     /// <see langword="true"/> if a row is available; <see langword="false"/> at the end or after disposal.
+     /// </returns>
+     public override bool MoveNext()
+     {
+         if (_disposed)
+             return false;
+         if (_enumerator.MoveNext())
+         {
+             _position++;
+             return true;
+         }
+         Dispose();
+         return false;
+     }
+ }
+ }
+ }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
index 22c78a4a4a..ea16ed7bf3 100644
--- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
+++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
@@ -5,6 +5,7 @@
Exe
false
false
+ <RootNamespace>Samples</RootNamespace>
diff --git a/src/Microsoft.ML.DataView/IDataView.cs b/src/Microsoft.ML.DataView/IDataView.cs
index 0bf632f5ef..6776990681 100644
--- a/src/Microsoft.ML.DataView/IDataView.cs
+++ b/src/Microsoft.ML.DataView/IDataView.cs
@@ -13,6 +13,13 @@ namespace Microsoft.ML
/// The input and output of Query Operators (Transforms). This is the fundamental data pipeline
/// type, comparable to <see cref="IEnumerable{T}"/> for LINQ.
/// </summary>
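+ /// <example>
+ /// <format type="text/markdown">
+ /// <![CDATA[
+ /// [!code-csharp[SimpleDataViewImplementation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs)]
+ /// ]]>
+ /// </format>
+ /// </example>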
public interface IDataView
{
///