diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs new file mode 100644 index 0000000000..b1a134b192 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/SimpleDataViewImplementation.cs @@ -0,0 +1,254 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; + +namespace Samples.Dynamic +{ + /// + /// The <see cref="IDataView"/> interface is the central concept of "data" in ML.NET. While many conveniences exist + /// to create pre-baked implementations, it is also useful to know how to create one completely from scratch. We also + /// take this opportunity to illustrate and motivate the basic principles of how the IDataView system is architected, + /// since people interested in implementing <see cref="IDataView"/> need at least some knowledge of those principles. + /// + public static class SimpleDataViewImplementation + { + public static void Example() + { + // First we create an array of InputObject instances, which we "present" through the IDataView implementation below so that it + // can be used in a simple ML.NET pipeline. + var inputArray = new[] + { + new InputObject(false, "Hello my friend."), + new InputObject(true, "Stay awhile and listen."), + new InputObject(true, "Masterfully done hero!") + }; + var dataView = new InputObjectDataView(inputArray); + + // So, this is a very simple pipeline: a transformer that tokenizes Text and does nothing with the Label column + // at all. 
+ var mlContext = new MLContext(); + var transformedDataView = mlContext.Transforms.Text.TokenizeIntoWords( + "TokenizedText", "Text").Fit(dataView).Transform(dataView); + + var textColumn = transformedDataView.Schema["Text"]; + var tokensColumn = transformedDataView.Schema["TokenizedText"]; + + using (var cursor = transformedDataView.GetRowCursor(new[] { textColumn, tokensColumn })) + { + // Note that it is best to get the getters and values *before* iteration, so as to facilitate buffer + // sharing (if applicable) and to perform column-type validation once, rather than many times. + ReadOnlyMemory textValue = default; + VBuffer> tokensValue = default; + + var textGetter = cursor.GetGetter>(textColumn); + var tokensGetter = cursor.GetGetter>>(tokensColumn); + + while (cursor.MoveNext()) + { + textGetter(ref textValue); + tokensGetter(ref tokensValue); + + Console.WriteLine($"{textValue} => {string.Join(", ", tokensValue.DenseValues())}"); + } + + // The output to console is this: + + // Hello my friend. => Hello, my, friend. + // Stay awhile and listen. => Stay, awhile, and, listen. + // Masterfully done hero! => Masterfully, done, hero! + + // Note that it may be interesting to set a breakpoint on the Console.WriteLine, and explore + // what is going on with the cursor, and the buffers. In particular, on the third iteration, + // while `tokensValue` is logically presented as a three-element array, internally you will + // see that the arrays internal to that structure have (at least) four items, specifically: + // `Masterfully`, `done`, `hero!`, `listen.`. In this way we see a simple example of the details + // of how buffer sharing from one iteration to the next actually works. + } + } + + private sealed class InputObject + { + public bool Label { get; } + public string Text { get; } + + public InputObject(bool label, string text) + { + Label = label; + Text = text; + } + } + + /// + /// This is an implementation of <see cref="IDataView"/> that wraps an + /// <see cref="IEnumerable{T}"/> of the above <see cref="InputObject"/>. 
Note that normally under these circumstances, the first + /// recommendation would be to use a convenience like + /// <see cref="DataOperationsCatalog.LoadFromEnumerable{TRow}(IEnumerable{TRow}, SchemaDefinition)"/> + /// or something like that, rather than implementing <see cref="IDataView"/> outright. However, in situations where + /// code generation is impossible, such as on Unity or other similar platforms, implementing + /// something closely resembling this may become necessary. + /// + /// This implementation of <see cref="IDataView"/>, being didactic, is much simpler than practically + /// anything one would find in the ML.NET codebase. In this case we have a completely fixed schema (the two + /// fields of <see cref="InputObject"/>), with fixed types. + /// + /// For <see cref="Schema"/>, note that we keep a very simple schema based off the members of the object. You + /// may in fact note that, in this specific case, this implementation of <see cref="IDataView"/> + /// could share the same <see cref="DataViewSchema"/> object across all instances of this + /// class, but since this is almost never the case, I do not take advantage of that. + /// + /// We have chosen to wrap an <see cref="IEnumerable{T}"/>, so in fact only a very simple implementation is + /// possible. Specifically: we cannot meaningfully shuffle (so <see cref="CanShuffle"/> is + /// <see langword="false"/>, and even if a <see cref="Random"/> parameter were passed to + /// <see cref="GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/>, we could not make use of it), we do + /// not know the count of the items right away without counting (so, it is most correct for + /// <see cref="GetRowCount"/> to return <see langword="null"/>, even after we might hypothetically know after + /// the first pass, given the immutability principle of <see cref="IDataView"/>), and the + /// <see cref="GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/> method returns a single item. + /// + /// The <see cref="DataViewRowCursor"/> derived class has more documentation specific to its behavior. + /// + /// Note that this implementation, as well as the nested <see cref="DataViewRowCursor"/> derived class, does + /// far less validation of parameters and guarding against misuse than we would like from, say, implementations of + /// the same classes within the ML.NET codebase. 
+ /// + private sealed class InputObjectDataView : IDataView + { + private readonly IEnumerable _data; + public DataViewSchema Schema { get; } + public bool CanShuffle => false; + + public InputObjectDataView(IEnumerable data) + { + _data = data; + + var builder = new DataViewSchema.Builder(); + builder.AddColumn("Label", BooleanDataViewType.Instance); + builder.AddColumn("Text", TextDataViewType.Instance); + Schema = builder.ToSchema(); + } + + public long? GetRowCount() => null; + + public DataViewRowCursor GetRowCursor(IEnumerable columnsNeeded, Random rand = null) + => new Cursor(this, columnsNeeded.Any(c => c.Index == 0), columnsNeeded.Any(c => c.Index == 1)); + + public DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) + => new[] { GetRowCursor(columnsNeeded, rand) }; + + /// + /// Having this be a private sealed nested class follows the typical pattern: in most + /// <see cref="IDataView"/> implementations, the cursor instance is almost always that. The only "common" + /// exceptions to this tendency are those implementations that are such thin wrappers over an existing + /// <see cref="IDataView"/> that they do not even bother to change the schema. + /// + /// On the subject of schema, note that there is an expectation that the <see cref="Schema"/> object is + /// reference equal to the <see cref="IDataView.Schema"/> object of the view that created this cursor, as we see here. + /// + /// Note that <see cref="Batch"/> returns 0. As described in the documentation of that property, that + /// is meant to facilitate the reconciliation of the partitioning of the data in the case where multiple + /// cursors are returned from + /// <see cref="IDataView.GetRowCursorSet(IEnumerable{DataViewSchema.Column}, int, Random)"/>, but since only one is + /// ever returned from the implementation, this behavior is appropriate. + /// + /// Similarly, since it is impossible to have a shuffled cursor or a cursor set, it is sufficient for the + /// <see cref="GetIdGetter"/> implementation to return a simple ID based on the position. 
If, however, this + /// had been something built on, hypothetically, an <see cref="IList{T}"/> or some other such structure, and + /// shuffling and partitioning were available, an ID based on the index of whatever item was being returned + /// would be appropriate. + /// + /// Note the usage of the <see langword="ref"/> parameters on the <see cref="ValueGetter{TValue}"/> + /// implementations. This is most valuable in the case of buffer sharing for <see cref="VBuffer{T}"/>, but + /// we still of course have to deal with it here. + /// + /// Note also that we spend a considerable amount of effort to make the + /// <see cref="IsColumnActive(DataViewSchema.Column)"/> and + /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/> methods correctly reflect what was asked for from + /// the <see cref="IDataView.GetRowCursor(IEnumerable{DataViewSchema.Column}, Random)"/> + /// method that was used to create this cursor. In this particular case, the point is somewhat moot: this + /// mechanism exists to enable lazy evaluation, but since this cursor is implemented to wrap an + /// <see cref="IEnumerator{T}"/> which has no concept of lazy evaluation, there is no real practical benefit + /// to doing this. However, it is best of course to illustrate the general principle for the sake of the + /// example. + /// + /// Even in this simple form, we see the reason why + /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/> is beneficial: the <see cref="ValueGetter{TValue}"/> implementations themselves are simple to the point + /// where their operation is dwarfed by the simple acts of casting and validation checking one sees in + /// <see cref="GetGetter{TValue}(DataViewSchema.Column)"/>. In this way we only pay the cost of validation + /// and casting once, not every time we get a value. + /// + private sealed class Cursor : DataViewRowCursor + { + private bool _disposed; + private long _position; + private readonly IEnumerator _enumerator; + private readonly Delegate[] _getters; + + public override long Position => _position; + public override long Batch => 0; + public override DataViewSchema Schema { get; } + + public Cursor(InputObjectDataView parent, bool wantsLabel, bool wantsText) + { + Schema = parent.Schema; + _position = -1; + _enumerator = parent._data.GetEnumerator(); + _getters = new Delegate[] + { + wantsLabel ? (ValueGetter)LabelGetterImplementation : null, + wantsText ? 
(ValueGetter>)TextGetterImplementation : null + }; + } + + protected override void Dispose(bool disposing) + { + if (_disposed) + return; + if (disposing) + { + _enumerator.Dispose(); + _position = -1; + } + _disposed = true; + base.Dispose(disposing); + } + + private void LabelGetterImplementation(ref bool value) + => value = _enumerator.Current.Label; + + private void TextGetterImplementation(ref ReadOnlyMemory value) + => value = _enumerator.Current.Text.AsMemory(); + + private void IdGetterImplementation(ref DataViewRowId id) + => id = new DataViewRowId((ulong)_position, 0); + + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + if (!IsColumnActive(column)) + throw new ArgumentOutOfRangeException(nameof(column)); + return (ValueGetter)_getters[column.Index]; + } + + public override ValueGetter GetIdGetter() + => IdGetterImplementation; + + public override bool IsColumnActive(DataViewSchema.Column column) + => _getters[column.Index] != null; + + public override bool MoveNext() + { + if (_disposed) + return false; + if (_enumerator.MoveNext()) + { + _position++; + return true; + } + Dispose(); + return false; + } + } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 22c78a4a4a..ea16ed7bf3 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -5,6 +5,7 @@ Exe false false + Samples diff --git a/src/Microsoft.ML.DataView/IDataView.cs b/src/Microsoft.ML.DataView/IDataView.cs index 0bf632f5ef..6776990681 100644 --- a/src/Microsoft.ML.DataView/IDataView.cs +++ b/src/Microsoft.ML.DataView/IDataView.cs @@ -13,6 +13,13 @@ namespace Microsoft.ML /// The input and output of Query Operators (Transforms). This is the fundamental data pipeline /// type, comparable to for LINQ. /// + /// + /// + /// + /// + /// public interface IDataView { ///