Skip to content

Don't fail in case of const field in Collection source and extended support for type conversion #555

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/Microsoft.ML.Api/ApiUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
using System.Reflection;
using System.Reflection.Emit;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Runtime.Internal.Utilities;

namespace Microsoft.ML.Runtime.Api
{
Expand All @@ -19,11 +18,12 @@ internal static class ApiUtils
private static OpCode GetAssignmentOpCode(Type t)
{
// REVIEW: This should be a Dictionary<Type, OpCode> based solution.
// DvTexts, strings, arrays, and VBuffers.
// DvTypes, strings, arrays, all nullable types, VBuffers and UInt128.
if (t == typeof(DvInt8) || t == typeof(DvInt4) || t == typeof(DvInt2) || t == typeof(DvInt1) ||
t == typeof(DvBool) || t==typeof(bool?) || t == typeof(DvText) || t == typeof(string) || t.IsArray ||
(t.IsGenericType && t.GetGenericTypeDefinition() == typeof(VBuffer<>)) || t == typeof(DvDateTime) ||
t == typeof(DvDateTimeZone) || t == typeof(DvTimeSpan) || t == typeof(UInt128))
t == typeof(DvBool) || t == typeof(DvText) || t == typeof(string) || t.IsArray ||
(t.IsGenericType && t.GetGenericTypeDefinition() == typeof(VBuffer<>)) ||
(t.IsGenericType && t.GetGenericTypeDefinition() == typeof(Nullable<>)) ||
t == typeof(DvDateTime) || t == typeof(DvDateTimeZone) || t == typeof(DvTimeSpan) || t == typeof(UInt128))
{
return OpCodes.Stobj;
}
Expand Down
207 changes: 146 additions & 61 deletions src/Microsoft.ML.Api/DataViewConstructionUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ private Delegate CreateGetter(int index)

var column = DataView._schema.SchemaDefn.Columns[index];
var outputType = column.IsComputed ? column.ReturnType : column.FieldInfo.FieldType;

var genericType = outputType;
Func<int, Delegate> del;

if (outputType.IsArray)
Expand All @@ -129,11 +129,66 @@ private Delegate CreateGetter(int index)
if (outputType.GetElementType() == typeof(string))
{
Ch.Assert(colType.ItemType.IsText);
return CreateStringArrayToVBufferGetter(index);
return CreateConvertingArrayGetterDelegate<String, DvText>(index, x => x == null ? DvText.NA : new DvText(x));
}
else if (outputType.GetElementType() == typeof(int))
{
Ch.Assert(colType.ItemType == NumberType.I4);
return CreateConvertingArrayGetterDelegate<int, DvInt4>(index, x => x);
}
else if (outputType.GetElementType() == typeof(int?))
{
Ch.Assert(colType.ItemType == NumberType.I4);
return CreateConvertingArrayGetterDelegate<int?, DvInt4>(index, x => x ?? DvInt4.NA);
}
else if (outputType.GetElementType() == typeof(long))
{
Ch.Assert(colType.ItemType == NumberType.I8);
return CreateConvertingArrayGetterDelegate<long, DvInt8>(index, x => x);
}
else if (outputType.GetElementType() == typeof(long?))
{
Ch.Assert(colType.ItemType == NumberType.I8);
return CreateConvertingArrayGetterDelegate<long?, DvInt8>(index, x => x ?? DvInt8.NA);
}
else if (outputType.GetElementType() == typeof(short))
{
Ch.Assert(colType.ItemType == NumberType.I2);
return CreateConvertingArrayGetterDelegate<short, DvInt2>(index, x => x);
}
else if (outputType.GetElementType() == typeof(short?))
{
Ch.Assert(colType.ItemType == NumberType.I2);
return CreateConvertingArrayGetterDelegate<short?, DvInt2>(index, x => x ?? DvInt2.NA);
}
else if (outputType.GetElementType() == typeof(sbyte))
{
Ch.Assert(colType.ItemType == NumberType.I1);
return CreateConvertingArrayGetterDelegate<sbyte, DvInt1>(index, x => x);
}
else if (outputType.GetElementType() == typeof(sbyte?))
{
Ch.Assert(colType.ItemType == NumberType.I1);
return CreateConvertingArrayGetterDelegate<sbyte?, DvInt1>(index, x => x ?? DvInt1.NA);
}
else if (outputType.GetElementType() == typeof(bool))
{
Ch.Assert(colType.ItemType.IsBool);
return CreateConvertingArrayGetterDelegate<bool, DvBool>(index, x => x);
}
else if (outputType.GetElementType() == typeof(bool?))
{
Ch.Assert(colType.ItemType.IsBool);
return CreateConvertingArrayGetterDelegate<bool?, DvBool>(index, x => x ?? DvBool.NA);
}

// T[] -> VBuffer<T>
Ch.Assert(outputType.GetElementType() == colType.ItemType.RawType);
del = CreateArrayToVBufferGetter<int>;
if (outputType.GetElementType().IsGenericType && outputType.GetElementType().GetGenericTypeDefinition() == typeof(Nullable<>))
Ch.Assert(Nullable.GetUnderlyingType(outputType.GetElementType()) == colType.ItemType.RawType);
else
Ch.Assert(outputType.GetElementType() == colType.ItemType.RawType);
del = CreateDirectArrayGetterDelegate<int>;
genericType = outputType.GetElementType();
}
else if (colType.IsVector)
{
Expand All @@ -142,99 +197,126 @@ private Delegate CreateGetter(int index)
Ch.Assert(outputType.IsGenericType);
Ch.Assert(outputType.GetGenericTypeDefinition() == typeof(VBuffer<>));
Ch.Assert(outputType.GetGenericArguments()[0] == colType.ItemType.RawType);
del = CreateVBufferToVBufferDelegate<int>;
del = CreateDirectVBufferGetterDelegate<int>;
}
else if (colType.IsPrimitive)
{
if (outputType == typeof(string))
{
// String -> DvText
Ch.Assert(colType.IsText);
return CreateStringToTextGetter(index);
return CreateConvertingGetterDelegate<String, DvText>(index, x => x == null ? DvText.NA : new DvText(x));
}
else if (outputType == typeof(bool))
{
// Bool -> DvBool
Ch.Assert(colType.IsBool);
return CreateBooleanToDvBoolGetter(index);
return CreateConvertingGetterDelegate<bool, DvBool>(index, x => x);
}
else if (outputType == typeof(bool?))
{
// Bool? -> DvBool
Ch.Assert(colType.IsBool);
return CreateNullableBooleanToDvBoolGetter(index);
return CreateConvertingGetterDelegate<bool?, DvBool>(index, x => x ?? DvBool.NA);
}
else if (outputType == typeof(int))
{
// int -> DvInt4
Ch.Assert(colType == NumberType.I4);
return CreateConvertingGetterDelegate<int, DvInt4>(index, x => x);
}
else if (outputType == typeof(int?))
{
// int? -> DvInt4
Ch.Assert(colType == NumberType.I4);
return CreateConvertingGetterDelegate<int?, DvInt4>(index, x => x ?? DvInt4.NA);
}
else if (outputType == typeof(short))
{
// short -> DvInt2
Ch.Assert(colType == NumberType.I2);
return CreateConvertingGetterDelegate<short, DvInt2>(index, x => x);
}
else if (outputType == typeof(short?))
{
// short? -> DvInt2
Ch.Assert(colType == NumberType.I2);
return CreateConvertingGetterDelegate<short?, DvInt2>(index, x => x ?? DvInt2.NA);
}
else if (outputType == typeof(long))
{
// long -> DvInt8
Ch.Assert(colType == NumberType.I8);
return CreateConvertingGetterDelegate<long, DvInt8>(index, x => x);
}
else if (outputType == typeof(long?))
{
// long? -> DvInt8
Ch.Assert(colType == NumberType.I8);
return CreateConvertingGetterDelegate<long?, DvInt8>(index, x => x ?? DvInt8.NA);
}
else if (outputType == typeof(sbyte))
{
// sbyte -> DvInt1
Ch.Assert(colType == NumberType.I1);
return CreateConvertingGetterDelegate<sbyte, DvInt1>(index, x => x);
}
else if (outputType == typeof(sbyte?))
{
// sbyte? -> DvInt1
Ch.Assert(colType == NumberType.I1);
return CreateConvertingGetterDelegate<sbyte?, DvInt1>(index, x => x ?? DvInt1.NA);
}

// T -> T
Ch.Assert(colType.RawType == outputType);
del = CreateDirectGetter<int>;
if (outputType.IsGenericType && outputType.GetGenericTypeDefinition() == typeof(Nullable<>))
Ch.Assert(colType.RawType == Nullable.GetUnderlyingType(outputType));
else
Ch.Assert(colType.RawType == outputType);
del = CreateDirectGetterDelegate<int>;
}
else
{
// REVIEW: Is this even possible?
throw Ch.ExceptNotImpl("Type '{0}' is not yet supported.", outputType.FullName);
}
MethodInfo meth =
del.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(colType.ItemType.RawType);
del.GetMethodInfo().GetGenericMethodDefinition().MakeGenericMethod(genericType);
return (Delegate)meth.Invoke(this, new object[] { index });
}

private Delegate CreateStringArrayToVBufferGetter(int index)
// REVIEW: The converting getter invokes a type conversion delegate on every call, so it's inherently slower
// than the 'direct' getter. We don't have good indication of this to the user, and the selection
// of affected types is pretty arbitrary (signed integers and bools, but not uints and floats).
private Delegate CreateConvertingArrayGetterDelegate<TSrc, TDst>(int index, Func<TSrc, TDst> convert)
{
var peek = DataView._peeks[index] as Peek<TRow, string[]>;
var peek = DataView._peeks[index] as Peek<TRow, TSrc[]>;
Ch.AssertValue(peek);

string[] buf = null;

return (ValueGetter<VBuffer<DvText>>)((ref VBuffer<DvText> dst) =>
TSrc[] buf = default;
return (ValueGetter<VBuffer<TDst>>)((ref VBuffer<TDst> dst) =>
{
peek(GetCurrentRowObject(), Position, ref buf);
var n = Utils.Size(buf);
dst = new VBuffer<DvText>(n, Utils.Size(dst.Values) < n
? new DvText[n]
dst = new VBuffer<TDst>(n, Utils.Size(dst.Values) < n
? new TDst[n]
: dst.Values, dst.Indices);
for (int i = 0; i < n; i++)
dst.Values[i] = new DvText(buf[i]);
});
}

private Delegate CreateStringToTextGetter(int index)
{
var peek = DataView._peeks[index] as Peek<TRow, string>;
Ch.AssertValue(peek);
string buf = null;
return (ValueGetter<DvText>)((ref DvText dst) =>
{
peek(GetCurrentRowObject(), Position, ref buf);
dst = new DvText(buf);
});
}

private Delegate CreateBooleanToDvBoolGetter(int index)
{
var peek = DataView._peeks[index] as Peek<TRow, bool>;
Ch.AssertValue(peek);
bool buf = false;
return (ValueGetter<DvBool>)((ref DvBool dst) =>
{
peek(GetCurrentRowObject(), Position, ref buf);
dst = (DvBool)buf;
dst.Values[i] = convert(buf[i]);
});
}

private Delegate CreateNullableBooleanToDvBoolGetter(int index)
private Delegate CreateConvertingGetterDelegate<TSrc, TDst>(int index, Func<TSrc, TDst> convert)
{
var peek = DataView._peeks[index] as Peek<TRow, bool?>;
var peek = DataView._peeks[index] as Peek<TRow, TSrc>;
Ch.AssertValue(peek);
bool? buf = null;
return (ValueGetter<DvBool>)((ref DvBool dst) =>
TSrc buf = default;
return (ValueGetter<TDst>)((ref TDst dst) =>
{
peek(GetCurrentRowObject(), Position, ref buf);
dst = buf.HasValue ? (DvBool)buf.Value : DvBool.NA;
dst = convert(buf);
Copy link
Contributor

@Zruty0 Zruty0 Jul 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

convert [](start = 30, length = 7)

If convert is an identity, we are still incurring a delegate method call here, which is not efficient. #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have identity converts. even if I pass is x=>x I convert from TSrc to TDst class and call implicit converter. Which I'm not sure how to express through generic classes.
I would like to get rid of them, I just don't know way how to do it.


In reply to: 203875078 [](ancestors = 203875078)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I see now. Ok, why don't we call this CreateConvertingGetterDelegate, to make it clear that there's also CreateDirectGetterDelegate?


In reply to: 203876308 [](ancestors = 203876308,203875078)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I seem to recall this conversation with Shon, that any 'helper' mechanism that would introduce these expensive conversions better doesn't exist at all.
I don't agree with it, but it's something to consider: with all but no warning to the user, the pipeline with int will be slower than the pipeline with float because of the conversion involved.

This is part of a conversation about why we even have DvInts.

Maybe a REVIEW comment alongside this converting getter/setter would be in order?

REVIEW: the converting getter invokes a type conversion delegate on every call, so it's inherently slower than the 'direct' getter. We don't have good indication of this to the user, and the selection of affected types is pretty arbitrary (signed integers and bools, but not uints and floats).

In reply to: 203877031 [](ancestors = 203877031,203876308,203875078)

});
}

private Delegate CreateArrayToVBufferGetter<TDst>(int index)
private Delegate CreateDirectArrayGetterDelegate<TDst>(int index)
{
var peek = DataView._peeks[index] as Peek<TRow, TDst[]>;
Ch.AssertValue(peek);
Expand All @@ -250,26 +332,29 @@ private Delegate CreateArrayToVBufferGetter<TDst>(int index)
});
}

private Delegate CreateVBufferToVBufferDelegate<TDst>(int index)
private Delegate CreateDirectVBufferGetterDelegate<TDst>(int index)
{
var peek = DataView._peeks[index] as Peek<TRow, VBuffer<TDst>>;
Ch.AssertValue(peek);
VBuffer<TDst> buf = default(VBuffer<TDst>);
return (ValueGetter<VBuffer<TDst>>)((ref VBuffer<TDst> dst) =>
{
// The peek for a VBuffer is just a simple assignment, so there is
// no copy going on in the peek, so we must do that as a second
// step to the destination.
peek(GetCurrentRowObject(), Position, ref buf);
buf.CopyTo(ref dst);
});
{
// The peek for a VBuffer is just a simple assignment, so there is
// no copy going on in the peek, so we must do that as a second
// step to the destination.
peek(GetCurrentRowObject(), Position, ref buf);
buf.CopyTo(ref dst);
});
}

private Delegate CreateDirectGetter<TDst>(int index)
private Delegate CreateDirectGetterDelegate<TDst>(int index)
{
var peek = DataView._peeks[index] as Peek<TRow, TDst>;
Ch.AssertValue(peek);
return (ValueGetter<TDst>)((ref TDst dst) => { peek(GetCurrentRowObject(), Position, ref dst); });
return (ValueGetter<TDst>)((ref TDst dst) =>
{
peek(GetCurrentRowObject(), Position, ref dst);
});
}

protected abstract TRow GetCurrentRowObject();
Expand Down
3 changes: 3 additions & 0 deletions src/Microsoft.ML.Api/SchemaDefinition.cs
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,9 @@ public static SchemaDefinition Create(Type userType)
// of later at cursor creation.
if (fieldInfo.FieldType == typeof(IChannel))
continue;
// Const fields do not need to be mapped.
if (fieldInfo.IsLiteral)
continue;

if (fieldInfo.GetCustomAttribute<NoColumnAttribute>() != null)
continue;
Expand Down
Loading