Skip to content

Commit f6d55f3

Browse files
authored
Make DataViewRowId not act like a number. (#2707)
* Make DataViewRowId not act like a number. - Remove it from the NumberDataViewType. - Remove any method/operator that makes it feel like a number. Working towards #2297
1 parent a16eb30 commit f6d55f3

File tree

13 files changed

+175
-161
lines changed

13 files changed

+175
-161
lines changed

src/Microsoft.Data.DataView/DataViewRowId.cs

+3-60
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
namespace Microsoft.Data.DataView
1010
{
1111
/// <summary>
12-
/// A structure serving as a sixteen-byte unsigned integer. It is used as the row id of <see cref="IDataView"/>.
12+
/// A structure serving as the identifier of a row of <see cref="IDataView"/>.
1313
/// For datasets with millions of records, those IDs need to be unique, therefore the need for such a large structure to hold the values.
1414
/// Those Ids are derived from other Ids of the previous components of the pipelines, and dividing the structure in two: high order and low order of bits,
1515
/// and reduces the changes of those collisions even further.
@@ -53,70 +53,13 @@ public bool Equals(DataViewRowId other)
5353

5454
public override bool Equals(object obj)
5555
{
56-
if (obj != null && obj is DataViewRowId)
56+
if (obj is DataViewRowId other)
5757
{
58-
var item = (DataViewRowId)obj;
59-
return Equals(item);
58+
return Equals(other);
6059
}
6160
return false;
6261
}
6362

64-
public static DataViewRowId operator +(DataViewRowId first, ulong second)
65-
{
66-
ulong resHi = first.High;
67-
ulong resLo = first.Low + second;
68-
if (resLo < second)
69-
resHi++;
70-
return new DataViewRowId(resLo, resHi);
71-
}
72-
73-
public static DataViewRowId operator -(DataViewRowId first, ulong second)
74-
{
75-
ulong resHi = first.High;
76-
ulong resLo = first.Low - second;
77-
if (resLo > first.Low)
78-
resHi--;
79-
return new DataViewRowId(resLo, resHi);
80-
}
81-
82-
public static bool operator ==(DataViewRowId first, ulong second)
83-
{
84-
return first.High == 0 && first.Low == second;
85-
}
86-
87-
public static bool operator !=(DataViewRowId first, ulong second)
88-
{
89-
return !(first == second);
90-
}
91-
92-
public static bool operator <(DataViewRowId first, ulong second)
93-
{
94-
return first.High == 0 && first.Low < second;
95-
}
96-
97-
public static bool operator >(DataViewRowId first, ulong second)
98-
{
99-
return first.High > 0 || first.Low > second;
100-
}
101-
102-
public static bool operator <=(DataViewRowId first, ulong second)
103-
{
104-
return first.High == 0 && first.Low <= second;
105-
}
106-
107-
public static bool operator >=(DataViewRowId first, ulong second)
108-
{
109-
return first.High > 0 || first.Low >= second;
110-
}
111-
112-
public static explicit operator double(DataViewRowId x)
113-
{
114-
// REVIEW: The 64-bit JIT has a bug where rounding might be not quite
115-
// correct when converting a ulong to double with the high bit set. Should we
116-
// care and compensate? See the DoubleParser code for a work-around.
117-
return x.High * ((double)(1UL << 32) * (1UL << 32)) + x.Low;
118-
}
119-
12063
public override int GetHashCode()
12164
{
12265
return (int)(

src/Microsoft.Data.DataView/DataViewType.cs

+35-11
Original file line numberDiff line numberDiff line change
@@ -199,17 +199,6 @@ public static NumberDataViewType UInt64
199199
}
200200
}
201201

202-
private static volatile NumberDataViewType _instDataViewRowId;
203-
public static NumberDataViewType DataViewRowId
204-
{
205-
get
206-
{
207-
return _instDataViewRowId ??
208-
Interlocked.CompareExchange(ref _instDataViewRowId, new NumberDataViewType(typeof(DataViewRowId), "UG"), null) ??
209-
_instDataViewRowId;
210-
}
211-
}
212-
213202
private static volatile NumberDataViewType _instSingle;
214203
public static NumberDataViewType Single
215204
{
@@ -243,6 +232,41 @@ public override bool Equals(DataViewType other)
243232
public override string ToString() => _name;
244233
}
245234

235+
/// <summary>
236+
/// The DataViewRowId type.
237+
/// </summary>
238+
public sealed class RowIdDataViewType : PrimitiveDataViewType
239+
{
240+
private static volatile RowIdDataViewType _instance;
241+
public static RowIdDataViewType Instance
242+
{
243+
get
244+
{
245+
return _instance ??
246+
Interlocked.CompareExchange(ref _instance, new RowIdDataViewType(), null) ??
247+
_instance;
248+
}
249+
}
250+
251+
private RowIdDataViewType()
252+
: base(typeof(DataViewRowId))
253+
{
254+
}
255+
256+
public override bool Equals(DataViewType other)
257+
{
258+
if (other == this)
259+
return true;
260+
Debug.Assert(!(other is RowIdDataViewType));
261+
return false;
262+
}
263+
264+
public override string ToString()
265+
{
266+
return "DataViewRowId";
267+
}
268+
}
269+
246270
/// <summary>
247271
/// The standard boolean type.
248272
/// </summary>

src/Microsoft.ML.Core/Data/ColumnTypeExtensions.cs

+7-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ internal static class ColumnTypeExtensions
1919
/// </summary>
2020
public static bool IsStandardScalar(this DataViewType columnType) =>
2121
(columnType is NumberDataViewType) || (columnType is TextDataViewType) || (columnType is BooleanDataViewType) ||
22-
(columnType is TimeSpanDataViewType) || (columnType is DateTimeDataViewType) || (columnType is DateTimeOffsetDataViewType);
22+
(columnType is RowIdDataViewType) || (columnType is TimeSpanDataViewType) ||
23+
(columnType is DateTimeDataViewType) || (columnType is DateTimeOffsetDataViewType);
2324

2425
/// <summary>
2526
/// Zero return means it's not a key type.
@@ -103,6 +104,8 @@ public static PrimitiveDataViewType PrimitiveTypeFromType(Type type)
103104
return DateTimeDataViewType.Instance;
104105
if (type == typeof(DateTimeOffset))
105106
return DateTimeOffsetDataViewType.Instance;
107+
if (type == typeof(DataViewRowId))
108+
return RowIdDataViewType.Instance;
106109
return NumberTypeFromType(type);
107110
}
108111

@@ -118,6 +121,8 @@ public static PrimitiveDataViewType PrimitiveTypeFromKind(InternalDataKind kind)
118121
return DateTimeDataViewType.Instance;
119122
if (kind == InternalDataKind.DZ)
120123
return DateTimeOffsetDataViewType.Instance;
124+
if (kind == InternalDataKind.UG)
125+
return RowIdDataViewType.Instance;
121126
return NumberTypeFromKind(kind);
122127
}
123128

@@ -131,7 +136,7 @@ public static NumberDataViewType NumberTypeFromType(Type type)
131136
throw new InvalidOperationException($"Bad type in {nameof(ColumnTypeExtensions)}.{nameof(NumberTypeFromType)}: {type}");
132137
}
133138

134-
public static NumberDataViewType NumberTypeFromKind(InternalDataKind kind)
139+
private static NumberDataViewType NumberTypeFromKind(InternalDataKind kind)
135140
{
136141
switch (kind)
137142
{
@@ -155,8 +160,6 @@ public static NumberDataViewType NumberTypeFromKind(InternalDataKind kind)
155160
return NumberDataViewType.Single;
156161
case InternalDataKind.R8:
157162
return NumberDataViewType.Double;
158-
case InternalDataKind.UG:
159-
return NumberDataViewType.DataViewRowId;
160163
}
161164

162165
Contracts.Assert(false);

src/Microsoft.ML.Data/Data/DataViewUtils.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ private static DataViewRowCursor ConsolidateCore(IChannelProvider provider, Data
357357
outPipes[i] = OutPipe.Create(type, pool);
358358
}
359359
int idIdx = activeToCol.Length + (int)ExtraIndex.Id;
360-
outPipes[idIdx] = OutPipe.Create(NumberDataViewType.DataViewRowId, GetPool(NumberDataViewType.DataViewRowId, ourPools, idIdx));
360+
outPipes[idIdx] = OutPipe.Create(RowIdDataViewType.Instance, GetPool(RowIdDataViewType.Instance, ourPools, idIdx));
361361

362362
// Create the structures to synchronize between the workers and the consumer.
363363
const int toConsumeBound = 4;
@@ -553,7 +553,7 @@ private DataViewRowCursor[] SplitCore(IChannelProvider ch, DataViewRowCursor inp
553553
int idIdx = activeToCol.Length + (int)ExtraIndex.Id;
554554
inPipes[idIdx] = CreateIdInPipe(input);
555555
for (int i = 0; i < cthd; ++i)
556-
outPipes[i][idIdx] = inPipes[idIdx].CreateOutPipe(NumberDataViewType.DataViewRowId);
556+
outPipes[i][idIdx] = inPipes[idIdx].CreateOutPipe(RowIdDataViewType.Instance);
557557

558558
var toConsume = new BlockingCollection<Batch>(toConsumeBound);
559559
var batchColumnPool = new MadeObjectPool<BatchColumn[]>(() => new BatchColumn[inPipes.Length]);

src/Microsoft.ML.Data/DataLoadSave/Binary/Codecs.cs

+3-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,9 @@ private sealed class UnsafeTypeCodec<T> : SimpleCodec<T> where T : struct
159159
// Throws an exception if T is neither a TimeSpan nor a NumberType.
160160
private static DataViewType UnsafeColumnType(Type type)
161161
{
162-
return type == typeof(TimeSpan) ? (DataViewType)TimeSpanDataViewType.Instance : ColumnTypeExtensions.NumberTypeFromType(type);
162+
return type == typeof(TimeSpan) ? TimeSpanDataViewType.Instance :
163+
type == typeof(DataViewRowId) ? (DataViewType)RowIdDataViewType.Instance :
164+
ColumnTypeExtensions.NumberTypeFromType(type);
163165
}
164166

165167
public UnsafeTypeCodec(CodecFactory factory)

src/Microsoft.ML.Data/Transforms/Hashing.cs

+3-2
Original file line numberDiff line numberDiff line change
@@ -1213,10 +1213,11 @@ internal void Save(ModelSaveContext ctx)
12131213
internal static bool IsColumnTypeValid(DataViewType type)
12141214
{
12151215
var itemType = type.GetItemType();
1216-
return itemType is TextDataViewType || itemType is KeyType || itemType is NumberDataViewType || itemType is BooleanDataViewType;
1216+
return itemType is TextDataViewType || itemType is KeyType || itemType is NumberDataViewType ||
1217+
itemType is BooleanDataViewType || itemType is RowIdDataViewType;
12171218
}
12181219

1219-
internal const string ExpectedColumnType = "Expected Text, Key, numeric or Boolean item type";
1220+
internal const string ExpectedColumnType = "Expected Text, Key, numeric, Boolean or DataViewRowId item type";
12201221

12211222
/// <summary>
12221223
/// Initializes a new instance of <see cref="HashingEstimator"/>.

src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ public Cursor(IChannelProvider provider, int poolRows, DataViewRowCursor input,
529529
input.Schema[c].Type, RowCursorUtils.GetGetterAsDelegate(input, c));
530530
_getters[ia] = CreateGetterDelegate(c);
531531
}
532-
var idPipe = _pipes[numActive + (int)ExtraIndex.Id] = ShufflePipe.Create(_pipeIndices.Length, NumberDataViewType.DataViewRowId, input.GetIdGetter());
532+
var idPipe = _pipes[numActive + (int)ExtraIndex.Id] = ShufflePipe.Create(_pipeIndices.Length, RowIdDataViewType.Instance, input.GetIdGetter());
533533
_idGetter = CreateGetterDelegate<DataViewRowId>(idPipe);
534534
// Initially, after the preamble to MoveNextCore, we want:
535535
// liveCount=0, deadCount=0, circularIndex=0. So we set these

src/Microsoft.ML.Parquet/ParquetLoader.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ private DataViewType ConvertFieldType(DataType parquetType)
350350
case DataType.Int64:
351351
return NumberDataViewType.Int64;
352352
case DataType.Int96:
353-
return NumberDataViewType.DataViewRowId;
353+
return RowIdDataViewType.Instance;
354354
case DataType.ByteArray:
355355
return new VectorType(NumberDataViewType.Byte);
356356
case DataType.String:

src/Microsoft.ML.StaticPipe/StaticSchemaShape.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ private static Type GetTypeOrNull(SchemaShape.Column col)
166166
if (physType != null && (
167167
pt == NumberDataViewType.SByte || pt == NumberDataViewType.Int16 || pt == NumberDataViewType.Int32 || pt == NumberDataViewType.Int32 ||
168168
pt == NumberDataViewType.Byte || pt == NumberDataViewType.UInt16 || pt == NumberDataViewType.UInt32 || pt == NumberDataViewType.UInt32 ||
169-
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == NumberDataViewType.DataViewRowId || pt == BooleanDataViewType.Instance ||
169+
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == RowIdDataViewType.Instance || pt == BooleanDataViewType.Instance ||
170170
pt == DateTimeDataViewType.Instance || pt == DateTimeOffsetDataViewType.Instance || pt == TimeSpanDataViewType.Instance ||
171171
pt == TextDataViewType.Instance))
172172
{
@@ -311,7 +311,7 @@ private static Type GetTypeOrNull(DataViewSchema.Column col)
311311
if (physType != null && (
312312
pt == NumberDataViewType.SByte || pt == NumberDataViewType.Int16 || pt == NumberDataViewType.Int32 || pt == NumberDataViewType.Int64 ||
313313
pt == NumberDataViewType.Byte || pt == NumberDataViewType.UInt16 || pt == NumberDataViewType.UInt32 || pt == NumberDataViewType.UInt64 ||
314-
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == NumberDataViewType.DataViewRowId || pt == BooleanDataViewType.Instance ||
314+
pt == NumberDataViewType.Single || pt == NumberDataViewType.Double || pt == RowIdDataViewType.Instance || pt == BooleanDataViewType.Instance ||
315315
pt == DateTimeDataViewType.Instance || pt == DateTimeOffsetDataViewType.Instance || pt == TimeSpanDataViewType.Instance ||
316316
pt == TextDataViewType.Instance))
317317
{

0 commit comments

Comments
 (0)