Skip to content

IDataView to DataFrame #5712

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 22, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/Microsoft.Data.Analysis/DataFrame.IDataView.cs
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@
namespace Microsoft.Data.Analysis
{
public partial class DataFrame : IDataView
{
{
/// <summary>
/// Always <c>false</c>: this implementation does not offer shuffled row cursors.
/// </summary>
// TODO: support shuffling
bool IDataView.CanShuffle
{
    get { return false; }
}

@@ -53,6 +53,7 @@ private DataViewRowCursor GetRowCursorCore(IEnumerable<DataViewSchema.Column> co

return new RowCursor(this, activeColumns);
}

DataViewRowCursor IDataView.GetRowCursor(IEnumerable<DataViewSchema.Column> columnsNeeded, Random rand)
{
return GetRowCursorCore(columnsNeeded);
14 changes: 14 additions & 0 deletions src/Microsoft.Data.Analysis/DataFrameColumn.cs
Original file line number Diff line number Diff line change
@@ -247,6 +247,20 @@ public virtual DataFrameColumn Sort(bool ascending = true)
/// </param>
protected internal virtual void AddDataViewColumn(DataViewSchema.Builder builder) => throw new NotImplementedException();

/// <summary>
/// Appends the value found at the current position of <paramref name="cursor"/> to this <see cref="DataFrameColumn"/>.
/// The base implementation always throws; derived column types override it.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="ValueGetter">The cached ValueGetter delegate for this column.</param>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual void AddValueUsingCursor(DataViewRowCursor cursor, Delegate ValueGetter)
{
    throw new NotImplementedException();
}

/// <summary>
/// Returns the ValueGetter of <paramref name="schemaColumn"/> in <paramref name="cursor"/> as a delegate that the caller can cache.
/// The base implementation always throws; derived column types override it.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The <see cref="DataViewSchema.Column"/> to return the ValueGetter for.</param>
/// <returns>The getter delegate for <paramref name="schemaColumn"/>.</returns>
/// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
protected internal virtual Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn)
{
    throw new NotImplementedException();
}

/// <summary>
/// Clamps values beyond the specified thresholds
/// </summary>
144 changes: 144 additions & 0 deletions src/Microsoft.Data.Analysis/IDataView.Extension.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Data.Analysis;
using Microsoft.ML.Data;

namespace Microsoft.ML
{
/// <summary>
/// Extension methods that copy the rows of an <see cref="IDataView"/> into a <see cref="Microsoft.Data.Analysis.DataFrame"/>.
/// </summary>
public static class IDataViewExtensions
{
    // Row limit applied by the overloads that do not take an explicit maxRows.
    private const int defaultMaxRows = 100;

    /// <summary>
    /// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> from this <paramref name="dataView"/>.
    /// </summary>
    /// <param name="dataView">The current <see cref="IDataView"/>.</param>
    /// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Defaults to 100. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
    /// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with at most <paramref name="maxRows"/> rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows = defaultMaxRows)
    {
        // A null column selection means "every non-hidden column".
        return ToDataFrame(dataView, maxRows, null);
    }

    /// <summary>
    /// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first 100 rows of this <paramref name="dataView"/>.
    /// </summary>
    /// <param name="dataView">The current <see cref="IDataView"/>.</param>
    /// <param name="selectColumns">The columns selected for the resultant DataFrame.</param>
    /// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and at most 100 rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, params string[] selectColumns)
    {
        return ToDataFrame(dataView, defaultMaxRows, selectColumns);
    }

    /// <summary>
    /// Returns a <see cref="Microsoft.Data.Analysis.DataFrame"/> with the first <paramref name="maxRows"/> rows of this <paramref name="dataView"/>.
    /// </summary>
    /// <param name="dataView">The current <see cref="IDataView"/>.</param>
    /// <param name="maxRows">The max number of rows in the <see cref="Microsoft.Data.Analysis.DataFrame"/>. Use -1 to construct a DataFrame using all the rows in <paramref name="dataView"/>.</param>
    /// <param name="selectColumns">The columns selected for the resultant DataFrame. When null or empty, every non-hidden column is used.</param>
    /// <returns>A <see cref="Microsoft.Data.Analysis.DataFrame"/> with the selected columns and at most <paramref name="maxRows"/> rows.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="dataView"/> is null.</exception>
    /// <exception cref="NotSupportedException">A selected column has a type with no matching <see cref="DataFrameColumn"/>.</exception>
    public static DataFrame ToDataFrame(this IDataView dataView, long maxRows, params string[] selectColumns)
    {
        if (dataView == null)
        {
            throw new ArgumentNullException(nameof(dataView));
        }

        DataViewSchema schema = dataView.Schema;
        List<DataFrameColumn> dataFrameColumns = new List<DataFrameColumn>(schema.Count);

        // -1 is the sentinel for "no limit".
        maxRows = maxRows == -1 ? long.MaxValue : maxRows;

        HashSet<string> selectColumnsSet = null;
        if (selectColumns != null && selectColumns.Length > 0)
        {
            selectColumnsSet = new HashSet<string>(selectColumns);
        }

        // Keep activeDataViewColumns and dataFrameColumns index-aligned: entry i of one
        // describes the source column for entry i of the other.
        List<DataViewSchema.Column> activeDataViewColumns = new List<DataViewSchema.Column>();
        foreach (DataViewSchema.Column dataViewColumn in schema)
        {
            if (dataViewColumn.IsHidden || (selectColumnsSet != null && !selectColumnsSet.Contains(dataViewColumn.Name)))
            {
                continue;
            }

            activeDataViewColumns.Add(dataViewColumn);
            dataFrameColumns.Add(CreateDataFrameColumn(dataViewColumn));
        }

        using (DataViewRowCursor cursor = dataView.GetRowCursor(activeDataViewColumns))
        {
            // Fetch each getter once up front so the row loop only invokes cached delegates.
            Delegate[] activeColumnDelegates = new Delegate[activeDataViewColumns.Count];
            for (int columnIndex = 0; columnIndex < activeColumnDelegates.Length; columnIndex++)
            {
                activeColumnDelegates[columnIndex] = dataFrameColumns[columnIndex].GetValueGetterUsingCursor(cursor, activeDataViewColumns[columnIndex]);
            }

            while (cursor.MoveNext() && cursor.Position < maxRows)
            {
                for (int i = 0; i < activeColumnDelegates.Length; i++)
                {
                    dataFrameColumns[i].AddValueUsingCursor(cursor, activeColumnDelegates[i]);
                }
            }
        }

        return new DataFrame(dataFrameColumns);
    }

    /// <summary>
    /// Creates the empty <see cref="DataFrameColumn"/> whose element type matches the type of <paramref name="dataViewColumn"/>.
    /// </summary>
    /// <param name="dataViewColumn">The schema column to create a matching DataFrame column for.</param>
    /// <returns>A new, empty column named after <paramref name="dataViewColumn"/>.</returns>
    /// <exception cref="NotSupportedException">The column type has no matching <see cref="DataFrameColumn"/>.</exception>
    private static DataFrameColumn CreateDataFrameColumn(DataViewSchema.Column dataViewColumn)
    {
        string name = dataViewColumn.Name;
        DataViewType type = dataViewColumn.Type;

        if (type == BooleanDataViewType.Instance)
        {
            return new BooleanDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Byte)
        {
            return new ByteDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Double)
        {
            return new DoubleDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Single)
        {
            return new SingleDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Int32)
        {
            return new Int32DataFrameColumn(name);
        }
        if (type == NumberDataViewType.Int64)
        {
            return new Int64DataFrameColumn(name);
        }
        if (type == NumberDataViewType.SByte)
        {
            return new SByteDataFrameColumn(name);
        }
        if (type == NumberDataViewType.Int16)
        {
            return new Int16DataFrameColumn(name);
        }
        if (type == NumberDataViewType.UInt32)
        {
            return new UInt32DataFrameColumn(name);
        }
        if (type == NumberDataViewType.UInt64)
        {
            return new UInt64DataFrameColumn(name);
        }
        if (type == NumberDataViewType.UInt16)
        {
            return new UInt16DataFrameColumn(name);
        }
        if (type == TextDataViewType.Instance)
        {
            return new StringDataFrameColumn(name);
        }

        // Every type without a mapping above is rejected.
        // NOTE(review): this presumably includes vector-typed IDataView columns, since no
        // vector column type is mapped here - confirm and track separately.
        throw new NotSupportedException(String.Format(Microsoft.Data.Strings.NotSupportedColumnType, type.RawType.Name));
    }
}

}
26 changes: 26 additions & 0 deletions src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.cs
Original file line number Diff line number Diff line change
@@ -775,5 +775,31 @@ private static ValueGetter<ushort> CreateCharValueGetterDelegate(DataViewRowCurs

private static ValueGetter<double> CreateDecimalValueGetterDelegate(DataViewRowCursor cursor, PrimitiveDataFrameColumn<decimal> column) =>
(ref double value) => value = (double?)column[cursor.Position] ?? double.NaN;

/// <summary>
/// Reads the value of the cursor's current row through <paramref name="getter"/> and stores it in this column
/// at the index given by the cursor position: an existing slot is overwritten, and a position equal to
/// the current length appends.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="getter">The cached getter for this column; must be a <see cref="ValueGetter{T}"/>.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not a <see cref="ValueGetter{T}"/>.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is beyond the end of this column by more than one slot.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
    long row = cursor.Position;
    T value = default;
    Debug.Assert(getter != null, "Expected getter to be valid");

    // Direct cast instead of `as`: a getter of the wrong delegate type now fails with a
    // descriptive InvalidCastException rather than a NullReferenceException on invocation.
    ((ValueGetter<T>)getter)(ref value);

    if (Length > row)
    {
        this[row] = value;
    }
    else if (Length == row)
    {
        Append(value);
    }
    else
    {
        // The position skipped past the end of the column, so there is no slot to fill.
        throw new IndexOutOfRangeException(nameof(row));
    }
}

/// <summary>
/// Obtains from <paramref name="cursor"/> the getter of element type <typeparamref name="T"/> for <paramref name="schemaColumn"/>.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The schema column to return the getter for.</param>
/// <returns>The getter, typed as a plain <see cref="Delegate"/> so callers can cache it.</returns>
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) =>
    cursor.GetGetter<T>(schemaColumn);
}
}
27 changes: 27 additions & 0 deletions src/Microsoft.Data.Analysis/StringDataFrameColumn.cs
Original file line number Diff line number Diff line change
@@ -467,5 +467,32 @@ protected internal override Delegate GetDataViewGetter(DataViewRowCursor cursor)

private ValueGetter<ReadOnlyMemory<char>> CreateValueGetterDelegate(DataViewRowCursor cursor) =>
(ref ReadOnlyMemory<char> value) => value = this[cursor.Position].AsMemory();

/// <summary>
/// Reads the text value of the cursor's current row through <paramref name="getter"/>, converts it to a
/// <see cref="string"/> and stores it in this column at the index given by the cursor position: an existing
/// slot is overwritten, and a position equal to the current length appends.
/// </summary>
/// <param name="cursor">The row cursor which has the current position.</param>
/// <param name="getter">The cached getter for this column; must be a ValueGetter of ReadOnlyMemory&lt;char&gt;.</param>
/// <exception cref="InvalidCastException"><paramref name="getter"/> is not a ValueGetter of ReadOnlyMemory&lt;char&gt;.</exception>
/// <exception cref="IndexOutOfRangeException">The cursor position is beyond the end of this column by more than one slot.</exception>
protected internal override void AddValueUsingCursor(DataViewRowCursor cursor, Delegate getter)
{
    long row = cursor.Position;
    ReadOnlyMemory<char> value = default;
    Debug.Assert(getter != null, "Expected getter to be valid");

    // Direct cast instead of `as`: a getter of the wrong delegate type now fails with a
    // descriptive InvalidCastException rather than a NullReferenceException on invocation.
    ((ValueGetter<ReadOnlyMemory<char>>)getter)(ref value);

    if (Length > row)
    {
        this[row] = value.ToString();
    }
    else if (Length == row)
    {
        Append(value.ToString());
    }
    else
    {
        // The position skipped past the end of the column, so there is no slot to fill.
        throw new IndexOutOfRangeException(nameof(row));
    }
}

/// <summary>
/// Obtains from <paramref name="cursor"/> the ReadOnlyMemory&lt;char&gt; getter for <paramref name="schemaColumn"/>.
/// </summary>
/// <param name="cursor">The row cursor to obtain the getter from.</param>
/// <param name="schemaColumn">The schema column to return the getter for.</param>
/// <returns>The getter, typed as a plain <see cref="Delegate"/> so callers can cache it.</returns>
protected internal override Delegate GetValueGetterUsingCursor(DataViewRowCursor cursor, DataViewSchema.Column schemaColumn) =>
    cursor.GetGetter<ReadOnlyMemory<char>>(schemaColumn);
}
}
9 changes: 9 additions & 0 deletions src/Microsoft.Data.Analysis/strings.Designer.cs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion src/Microsoft.Data.Analysis/strings.resx
Original file line number Diff line number Diff line change
@@ -183,10 +183,13 @@
<data name="NonSeekableStream" xml:space="preserve">
<value>Expected a seekable stream</value>
</data>
<data name="NotSupportedColumnType" xml:space="preserve">
<value>{0} is not a supported column type.</value>
</data>
<data name="NumericColumnType" xml:space="preserve">
<value>numeric column</value>
</data>
<data name="SpansMultipleBuffers" xml:space="preserve">
<value>Cannot span multiple buffers</value>
</data>
</root>
</root>
Loading