Skip to content

Make DataViewRowId not act like a number. #2707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 3 additions & 60 deletions src/Microsoft.Data.DataView/DataViewRowId.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
namespace Microsoft.Data.DataView
{
/// <summary>
/// A structure serving as a sixteen-byte unsigned integer. It is used as the row id of <see cref="IDataView"/>.
/// A structure serving as the identifier of a row of <see cref="IDataView"/>.
/// For datasets with millions of records, those IDs need to be unique, therefore the need for such a large structure to hold the values.
/// Those Ids are derived from other Ids of the previous components of the pipelines, and dividing the structure in two: high order and low order of bits,
/// and reduces the changes of those collisions even further.
Expand Down Expand Up @@ -53,70 +53,13 @@ public bool Equals(DataViewRowId other)

public override bool Equals(object obj)
{
if (obj != null && obj is DataViewRowId)
if (obj is DataViewRowId other)
{
var item = (DataViewRowId)obj;
return Equals(item);
return Equals(other);
}
return false;
}

public static DataViewRowId operator +(DataViewRowId first, ulong second)
{
ulong resHi = first.High;
ulong resLo = first.Low + second;
if (resLo < second)
resHi++;
return new DataViewRowId(resLo, resHi);
}

public static DataViewRowId operator -(DataViewRowId first, ulong second)
{
ulong resHi = first.High;
ulong resLo = first.Low - second;
if (resLo > first.Low)
resHi--;
return new DataViewRowId(resLo, resHi);
}

public static bool operator ==(DataViewRowId first, ulong second)
{
return first.High == 0 && first.Low == second;
}

public static bool operator !=(DataViewRowId first, ulong second)
{
return !(first == second);
}

public static bool operator <(DataViewRowId first, ulong second)
{
return first.High == 0 && first.Low < second;
}

public static bool operator >(DataViewRowId first, ulong second)
{
return first.High > 0 || first.Low > second;
}

public static bool operator <=(DataViewRowId first, ulong second)
{
return first.High == 0 && first.Low <= second;
}

public static bool operator >=(DataViewRowId first, ulong second)
{
return first.High > 0 || first.Low >= second;
}

public static explicit operator double(DataViewRowId x)
{
// REVIEW: The 64-bit JIT has a bug where rounding might be not quite
// correct when converting a ulong to double with the high bit set. Should we
// care and compensate? See the DoubleParser code for a work-around.
return x.High * ((double)(1UL << 32) * (1UL << 32)) + x.Low;
}

public override int GetHashCode()
{
return (int)(
Expand Down
46 changes: 35 additions & 11 deletions src/Microsoft.Data.DataView/DataViewType.cs
Original file line number Diff line number Diff line change
Expand Up @@ -199,17 +199,6 @@ public static NumberDataViewType UInt64
}
}

private static volatile NumberDataViewType _instDataViewRowId;
public static NumberDataViewType DataViewRowId
{
get
{
return _instDataViewRowId ??
Interlocked.CompareExchange(ref _instDataViewRowId, new NumberDataViewType(typeof(DataViewRowId), "UG"), null) ??
_instDataViewRowId;
}
}

private static volatile NumberDataViewType _instSingle;
public static NumberDataViewType Single
{
Expand Down Expand Up @@ -243,6 +232,41 @@ public override bool Equals(DataViewType other)
public override string ToString() => _name;
}

/// <summary>
/// The DataViewRowId type.
/// </summary>
public sealed class RowIdDataViewType : PrimitiveDataViewType
{
private static volatile RowIdDataViewType _instance;
public static RowIdDataViewType Instance
{
get
{
return _instance ??
Interlocked.CompareExchange(ref _instance, new RowIdDataViewType(), null) ??
Copy link
Contributor

@rogancarr rogancarr Feb 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interlocked.CompareExchange [](start = 20, length = 27)

Nice! #Resolved

_instance;
}
}

private RowIdDataViewType()
: base(typeof(DataViewRowId))
{
}

public override bool Equals(DataViewType other)
{
if (other == this)
return true;
Debug.Assert(!(other is RowIdDataViewType));
return false;
}

public override string ToString()
{
return "DataViewRowId";
}
}

/// <summary>
/// The standard boolean type.
/// </summary>
Expand Down
11 changes: 7 additions & 4 deletions src/Microsoft.ML.Core/Data/ColumnTypeExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ internal static class ColumnTypeExtensions
/// </summary>
public static bool IsStandardScalar(this DataViewType columnType) =>
(columnType is NumberDataViewType) || (columnType is TextDataViewType) || (columnType is BooleanDataViewType) ||
(columnType is TimeSpanDataViewType) || (columnType is DateTimeDataViewType) || (columnType is DateTimeOffsetDataViewType);
(columnType is RowIdDataViewType) || (columnType is TimeSpanDataViewType) ||
(columnType is DateTimeDataViewType) || (columnType is DateTimeOffsetDataViewType);

/// <summary>
/// Zero return means it's not a key type.
Expand Down Expand Up @@ -103,6 +104,8 @@ public static PrimitiveDataViewType PrimitiveTypeFromType(Type type)
return DateTimeDataViewType.Instance;
if (type == typeof(DateTimeOffset))
return DateTimeOffsetDataViewType.Instance;
if (type == typeof(DataViewRowId))
return RowIdDataViewType.Instance;
return NumberTypeFromType(type);
}

Expand All @@ -118,6 +121,8 @@ public static PrimitiveDataViewType PrimitiveTypeFromKind(InternalDataKind kind)
return DateTimeDataViewType.Instance;
if (kind == InternalDataKind.DZ)
return DateTimeOffsetDataViewType.Instance;
if (kind == InternalDataKind.UG)
return RowIdDataViewType.Instance;
return NumberTypeFromKind(kind);
}

Expand All @@ -131,7 +136,7 @@ public static NumberDataViewType NumberTypeFromType(Type type)
throw new InvalidOperationException($"Bad type in {nameof(ColumnTypeExtensions)}.{nameof(NumberTypeFromType)}: {type}");
}

public static NumberDataViewType NumberTypeFromKind(InternalDataKind kind)
private static NumberDataViewType NumberTypeFromKind(InternalDataKind kind)
{
switch (kind)
{
Expand All @@ -155,8 +160,6 @@ public static NumberDataViewType NumberTypeFromKind(InternalDataKind kind)
return NumberDataViewType.Single;
case InternalDataKind.R8:
return NumberDataViewType.Double;
case InternalDataKind.UG:
return NumberDataViewType.DataViewRowId;
}

Contracts.Assert(false);
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Data/DataViewUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ private static DataViewRowCursor ConsolidateCore(IChannelProvider provider, Data
outPipes[i] = OutPipe.Create(type, pool);
}
int idIdx = activeToCol.Length + (int)ExtraIndex.Id;
outPipes[idIdx] = OutPipe.Create(NumberDataViewType.DataViewRowId, GetPool(NumberDataViewType.DataViewRowId, ourPools, idIdx));
outPipes[idIdx] = OutPipe.Create(RowIdDataViewType.Instance, GetPool(RowIdDataViewType.Instance, ourPools, idIdx));

// Create the structures to synchronize between the workers and the consumer.
const int toConsumeBound = 4;
Expand Down Expand Up @@ -553,7 +553,7 @@ private DataViewRowCursor[] SplitCore(IChannelProvider ch, DataViewRowCursor inp
int idIdx = activeToCol.Length + (int)ExtraIndex.Id;
inPipes[idIdx] = CreateIdInPipe(input);
for (int i = 0; i < cthd; ++i)
outPipes[i][idIdx] = inPipes[idIdx].CreateOutPipe(NumberDataViewType.DataViewRowId);
outPipes[i][idIdx] = inPipes[idIdx].CreateOutPipe(RowIdDataViewType.Instance);

var toConsume = new BlockingCollection<Batch>(toConsumeBound);
var batchColumnPool = new MadeObjectPool<BatchColumn[]>(() => new BatchColumn[inPipes.Length]);
Expand Down
4 changes: 3 additions & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ private sealed class UnsafeTypeCodec<T> : SimpleCodec<T> where T : struct
// Throws an exception if T is neither a TimeSpan nor a NumberType.
private static DataViewType UnsafeColumnType(Type type)
{
return type == typeof(TimeSpan) ? (DataViewType)TimeSpanDataViewType.Instance : ColumnTypeExtensions.NumberTypeFromType(type);
return type == typeof(TimeSpan) ? TimeSpanDataViewType.Instance :
type == typeof(DataViewRowId) ? (DataViewType)RowIdDataViewType.Instance :
ColumnTypeExtensions.NumberTypeFromType(type);
}

public UnsafeTypeCodec(CodecFactory factory)
Expand Down
5 changes: 3 additions & 2 deletions src/Microsoft.ML.Data/Transforms/Hashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1213,10 +1213,11 @@ internal void Save(ModelSaveContext ctx)
internal static bool IsColumnTypeValid(DataViewType type)
{
var itemType = type.GetItemType();
return itemType is TextDataViewType || itemType is KeyType || itemType is NumberDataViewType || itemType is BooleanDataViewType;
return itemType is TextDataViewType || itemType is KeyType || itemType is NumberDataViewType ||
itemType is BooleanDataViewType || itemType is RowIdDataViewType;
}

internal const string ExpectedColumnType = "Expected Text, Key, numeric or Boolean item type";
internal const string ExpectedColumnType = "Expected Text, Key, numeric, Boolean or DataViewRowId item type";

/// <summary>
/// Initializes a new instance of <see cref="HashingEstimator"/>.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ public Cursor(IChannelProvider provider, int poolRows, DataViewRowCursor input,
input.Schema[c].Type, RowCursorUtils.GetGetterAsDelegate(input, c));
_getters[ia] = CreateGetterDelegate(c);
}
var idPipe = _pipes[numActive + (int)ExtraIndex.Id] = ShufflePipe.Create(_pipeIndices.Length, NumberDataViewType.DataViewRowId, input.GetIdGetter());
var idPipe = _pipes[numActive + (int)ExtraIndex.Id] = ShufflePipe.Create(_pipeIndices.Length, RowIdDataViewType.Instance, input.GetIdGetter());
_idGetter = CreateGetterDelegate<DataViewRowId>(idPipe);
// Initially, after the preamble to MoveNextCore, we want:
// liveCount=0, deadCount=0, circularIndex=0. So we set these
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Parquet/ParquetLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ private DataViewType ConvertFieldType(DataType parquetType)
case DataType.Int64:
return NumberDataViewType.Int64;
case DataType.Int96:
return NumberDataViewType.DataViewRowId;
return RowIdDataViewType.Instance;
case DataType.ByteArray:
return new VectorType(NumberDataViewType.Byte);
case DataType.String:
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.StaticPipe/StaticSchemaShape.cs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ private static Type GetTypeOrNull(SchemaShape.Column col)
if (physType != null && (
pt == NumberDataViewType.SByte || pt == NumberDataViewType.Int16 || pt == NumberDataViewType.Int32 || pt == NumberDataViewType.Int32 ||
pt == NumberDataViewType.Byte || pt == NumberDataViewType.UInt16 || pt == NumberDataViewType.UInt32 || pt == NumberDataViewType.UInt32 ||
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == NumberDataViewType.DataViewRowId || pt == BooleanDataViewType.Instance ||
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == RowIdDataViewType.Instance || pt == BooleanDataViewType.Instance ||
pt == DateTimeDataViewType.Instance || pt == DateTimeOffsetDataViewType.Instance || pt == TimeSpanDataViewType.Instance ||
pt == TextDataViewType.Instance))
{
Expand Down Expand Up @@ -311,7 +311,7 @@ private static Type GetTypeOrNull(DataViewSchema.Column col)
if (physType != null && (
pt == NumberDataViewType.SByte || pt == NumberDataViewType.Int16 || pt == NumberDataViewType.Int32 || pt == NumberDataViewType.Int64 ||
pt == NumberDataViewType.Byte || pt == NumberDataViewType.UInt16 || pt == NumberDataViewType.UInt32 || pt == NumberDataViewType.UInt64 ||
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == NumberDataViewType.DataViewRowId || pt == BooleanDataViewType.Instance ||
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == RowIdDataViewType.Instance || pt == BooleanDataViewType.Instance ||
pt == DateTimeDataViewType.Instance || pt == DateTimeOffsetDataViewType.Instance || pt == TimeSpanDataViewType.Instance ||
pt == TextDataViewType.Instance))
{
Expand Down
Loading