Skip to content

Commit 3f455e2

Browse files
committed
Introduce ReadOnlyVBuffer and use it all over.
1 parent bee7f17 commit 3f455e2

33 files changed

+558
-380
lines changed

src/Microsoft.ML.Core/Data/VBuffer.cs

+156-2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,155 @@
88

99
namespace Microsoft.ML.Runtime.Data
1010
{
11+
/// <summary>
/// A read-only view of vector data that supports both dense and sparse representations.
/// It stores a logical <see cref="Length"/>, the number of explicitly represented items
/// (<see cref="Count"/>), and exposes the backing value/index arrays only as read-only spans.
/// </summary>
public readonly struct ReadOnlyVBuffer<T>
{
    private readonly T[] _values;
    private readonly int[] _indices;

    /// <summary>
    /// The logical length of the buffer.
    /// </summary>
    public readonly int Length;

    /// <summary>
    /// The number of items explicitly represented. This is == Length when the representation
    /// is dense and &lt; Length when sparse.
    /// </summary>
    public readonly int Count;

    /// <summary>
    /// The values. Only the first Count of these are valid. The span is created over the
    /// whole backing array, so it may be longer than Count.
    /// </summary>
    public ReadOnlySpan<T> Values => _values;

    /// <summary>
    /// The indices. For a dense representation, Indices is not used. For a sparse representation
    /// it is parallel to values and specifies the logical indices for the corresponding values.
    /// The span is created over the whole backing array, so it may be longer than Count.
    /// </summary>
    public ReadOnlySpan<int> Indices => _indices;

    /// <summary>
    /// Equivalent to Count == Length.
    /// </summary>
    public bool IsDense => Count == Length;

    /// <summary>
    /// Wraps the given arrays without copying them.
    /// </summary>
    /// <param name="logicalLength">The logical length of the vector; asserted to be non-negative.</param>
    /// <param name="valuesCount">The number of explicitly represented items; asserted to lie in [0, logicalLength].</param>
    /// <param name="values">The backing values; asserted to hold at least <paramref name="valuesCount"/> items.</param>
    /// <param name="indices">The backing indices; asserted to hold at least <paramref name="valuesCount"/>
    /// items unless the representation is dense.</param>
    internal ReadOnlyVBuffer(int logicalLength, int valuesCount, T[] values, int[] indices)
    {
        Contracts.Assert(logicalLength >= 0);
        Contracts.Assert(0 <= valuesCount && valuesCount <= logicalLength);
        Contracts.Assert(Utils.Size(values) >= valuesCount);
        Contracts.Assert(valuesCount == logicalLength || Utils.Size(indices) >= valuesCount);

        Length = logicalLength;
        Count = valuesCount;
        _values = values;
        _indices = indices;
    }

    // TODO: remove - this is just here to make it easier for now
    /// <summary>
    /// Fetches the value at logical position <paramref name="slot"/> into <paramref name="dst"/>.
    /// For a sparse buffer, a slot that is not explicitly represented yields default(T).
    /// </summary>
    public void GetItemOrDefault(int slot, ref T dst)
    {
        Contracts.CheckParam(0 <= slot && slot < Length, nameof(slot));

        if (IsDense)
            dst = Values[slot];
        else if (Count > 0 && Utils.TryFindIndexSorted(Indices, 0, Count, slot, out int index))
            dst = Values[index];
        else
            dst = default;
    }

    /// <summary>
    /// Copy from this buffer to the given destination. The arrays currently held by
    /// <paramref name="dst"/> are reused when they are large enough; otherwise new ones are allocated.
    /// The destination keeps the same dense/sparse representation as this buffer.
    /// </summary>
    public void CopyTo(ref VBuffer<T> dst)
    {
        var values = dst.Values;
        var indices = dst.Indices;
        if (IsDense)
        {
            if (Length > 0)
            {
                if (Utils.Size(values) < Length)
                    values = new T[Length];
                Values.Slice(0, Length).CopyTo(values);
            }
            dst = new VBuffer<T>(Length, values, indices);
            Contracts.Assert(dst.IsDense);
        }
        else
        {
            if (Count > 0)
            {
                if (Utils.Size(values) < Count)
                    values = new T[Count];
                if (Utils.Size(indices) < Count)
                    indices = new int[Count];
                // Only the first Count entries are meaningful, so only those are copied.
                Values.Slice(0, Count).CopyTo(values);
                Indices.Slice(0, Count).CopyTo(indices);
            }
            dst = new VBuffer<T>(Length, Count, values, indices);
        }
    }

    /// <summary>
    /// Copy from this buffer to the given destination, forcing a dense representation.
    /// </summary>
    public void CopyToDense(ref VBuffer<T> dst)
    {
        var values = dst.Values;
        if (Utils.Size(values) < Length)
            values = new T[Length];

        if (!IsDense)
            CopyTo(values);
        else if (Length > 0)
            Values.Slice(0, Length).CopyTo(values);
        dst = new VBuffer<T>(Length, values, dst.Indices);
    }

    /// <summary>
    /// Copy from this buffer to the given destination array. This "densifies".
    /// </summary>
    public void CopyTo(T[] dst)
    {
        CopyTo(dst, 0);
    }

    /// <summary>
    /// Copy from this buffer into <paramref name="dst"/> starting at offset <paramref name="ivDst"/>,
    /// writing exactly Length items. This "densifies": every slot that is not explicitly
    /// represented is written as <paramref name="defaultValue"/>.
    /// </summary>
    /// <param name="dst">The destination array; must have room for Length items from <paramref name="ivDst"/> on.</param>
    /// <param name="ivDst">The offset in <paramref name="dst"/> at which the first item is written.</param>
    /// <param name="defaultValue">The value written for slots that are not explicitly represented.</param>
    public void CopyTo(T[] dst, int ivDst, T defaultValue = default(T))
    {
        Contracts.CheckParam(0 <= ivDst && ivDst <= Utils.Size(dst) - Length, nameof(dst), "dst is not large enough");

        if (Length == 0)
            return;
        if (IsDense)
        {
            Values.Slice(0, Length).CopyTo(dst.AsSpan(ivDst));
            return;
        }

        if (Count == 0)
        {
            // No slot is explicitly represented, so every slot takes the caller-supplied
            // default. Array.Clear would write default(T) and silently ignore defaultValue.
            dst.AsSpan(ivDst, Length).Fill(defaultValue);
            return;
        }

        // Walk the explicit entries in index order, padding the gaps before, between and
        // after them with defaultValue.
        int iv = 0;
        for (int islot = 0; islot < Count; islot++)
        {
            int slot = Indices[islot];
            Contracts.Assert(slot >= iv);
            while (iv < slot)
                dst[ivDst + iv++] = defaultValue;
            Contracts.Assert(iv == slot);
            dst[ivDst + iv++] = Values[islot];
        }
        while (iv < Length)
            dst[ivDst + iv++] = defaultValue;
    }
}
159+
11160
/// <summary>
12161
/// A buffer that supports both dense and sparse representations. This is the
13162
/// representation type for all VectorType instances. When an instance of this
@@ -446,7 +595,7 @@ public void GetItemOrDefault(int slot, ref T dst)
446595
int index;
447596
if (IsDense)
448597
dst = Values[slot];
449-
else if (Count > 0 && Indices.TryFindIndexSorted(0, Count, slot, out index))
598+
else if (Count > 0 && Utils.TryFindIndexSorted(Indices, 0, Count, slot, out index))
450599
dst = Values[index];
451600
else
452601
dst = default(T);
@@ -459,9 +608,14 @@ public T GetItemOrDefault(int slot)
459608
int index;
460609
if (IsDense)
461610
return Values[slot];
462-
if (Count > 0 && Indices.TryFindIndexSorted(0, Count, slot, out index))
611+
if (Count > 0 && Utils.TryFindIndexSorted(Indices, 0, Count, slot, out index))
463612
return Values[index];
464613
return default(T);
465614
}
615+
616+
// Wraps the buffer's current length, count and backing arrays in a read-only view;
// the arrays are passed through as-is rather than copied.
public static implicit operator ReadOnlyVBuffer<T>(VBuffer<T> buffer)
    => new ReadOnlyVBuffer<T>(buffer.Length, buffer.Count, buffer.Values, buffer.Indices);
466620
}
467621
}

src/Microsoft.ML.Core/Utilities/FloatUtils.cs

+5-3
Original file line numberDiff line numberDiff line change
@@ -456,12 +456,14 @@ public static bool IsFinite(Double[] values, int count)
456456
// REVIEW: Consider implementing using SSE.
457457
// Array overload: checks the first 'count' items by delegating to the span overload.
public static bool IsFinite(Single[] values, int count)
    => IsFinite(values.AsSpan(0, count));
461461

462+
public static bool IsFinite(ReadOnlySpan<Single> values)
463+
{
462464
// Assuming that non-finites are rare, this is faster than testing on each item.
463465
Single sum = 0;
464-
for (int i = 0; i < count; i++)
466+
for (int i = 0; i < values.Length; i++)
465467
{
466468
var v = values[i];
467469
sum += v - v;

src/Microsoft.ML.Core/Utilities/Utils.cs

+25-7
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,8 @@ public static void Push<T>(ref Stack<T> stack, T item)
186186
/// In case of duplicates it returns the index of the first one.
187187
/// It guarantees that items before the returned index are &lt; value, while those at and after the returned index are &gt;= value.
188188
/// </summary>
189-
public static int FindIndexSorted(this int[] input, int value)
189+
// Searches the whole span by forwarding to the ranged overload with [0, input.Length).
public static int FindIndexSorted(this ReadOnlySpan<int> input, int value)
    => FindIndexSorted(input, 0, input.Length, value);
194193

@@ -233,9 +232,9 @@ public static int FindIndexSorted(this Double[] input, Double value)
233232
/// <c>index</c> parameter, and returns whether that index is a valid index
234233
/// pointing to a value equal to the input parameter <c>value</c>.
235234
/// </summary>
236-
public static bool TryFindIndexSorted(this int[] input, int min, int lim, int value, out int index)
235+
public static bool TryFindIndexSorted(ReadOnlySpan<int> input, int min, int lim, int value, out int index)
{
    // Always report the position found by the search, whether or not it is a hit.
    int position = FindIndexSorted(input, min, lim, value);
    index = position;

    // A position at or past lim is outside the searched range and must not be dereferenced.
    if (position >= lim)
        return false;
    return input[position] == value;
}
241240

@@ -245,10 +244,14 @@ public static bool TryFindIndexSorted(this int[] input, int min, int lim, int va
245244
/// In case of duplicates it returns the index of the first one.
246245
/// It guarantees that items before the returned index are &lt; value, while those at and after the returned index are &gt;= value.
247246
/// </summary>
248-
public static int FindIndexSorted(this int[] input, int min, int lim, int value)
247+
// Array overload: views the array as a span and forwards to the span-based search.
public static int FindIndexSorted(int[] input, int min, int lim, int value)
    => FindIndexSorted(input.AsSpan(), min, lim, value);
251+
252+
public static int FindIndexSorted(ReadOnlySpan<int> input, int min, int lim, int value)
253+
{
254+
Contracts.Assert(0 <= min & min <= lim & lim <= input.Length);
252255

253256
int minCur = min;
254257
int limCur = lim;
@@ -1087,5 +1090,20 @@ public static string GetDescription(this Enum value)
10871090
}
10881091
return null;
10891092
}
1093+
1094+
/// <summary>
/// Returns how many items of <paramref name="source"/> satisfy <paramref name="predicate"/>.
/// </summary>
public static int Count<TSource>(this ReadOnlySpan<TSource> source, Func<TSource, bool> predicate)
{
    Contracts.CheckValue(predicate, nameof(predicate));

    int matches = 0;
    foreach (TSource item in source)
    {
        if (predicate(item))
            matches++;
    }
    return matches;
}
10901108
}
10911109
}

src/Microsoft.ML.Core/Utilities/VBufferUtils.cs

+15-15
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ public static VBuffer<T> CreateDense<T>(int length)
147147
/// Applies <paramref name="visitor"/> to every explicitly defined element of the vector,
148148
/// in order of index.
149149
/// </summary>
150-
public static void ForEachDefined<T>(ref VBuffer<T> a, Action<int, T> visitor)
150+
public static void ForEachDefined<T>(in ReadOnlyVBuffer<T> a, Action<int, T> visitor)
151151
{
152152
Contracts.CheckValue(visitor, nameof(visitor));
153153

@@ -573,9 +573,9 @@ public static void CreateMaybeSparseCopy<T>(ref VBuffer<T> src, ref VBuffer<T> d
573573
/// <param name="src">Argument vector, whose elements are only read</param>
574574
/// <param name="dst">Argument vector, that could change</param>
575575
/// <param name="manip">Function to apply to each pair of elements</param>
576-
public static void ApplyWith<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer<TDst> dst, PairManipulator<TSrc, TDst> manip)
576+
// Forwards to ApplyWithCore with outer: false.
public static void ApplyWith<TSrc, TDst>(in ReadOnlyVBuffer<TSrc> src, ref VBuffer<TDst> dst, PairManipulator<TSrc, TDst> manip)
    => ApplyWithCore(in src, ref dst, manip, outer: false);
580580

581581
/// <summary>
@@ -608,9 +608,9 @@ public static void ApplyWithCopy<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer<
608608
/// <param name="src">Argument vector, whose elements are only read</param>
609609
/// <param name="dst">Argument vector, that could change</param>
610610
/// <param name="manip">Function to apply to each pair of elements</param>
611-
public static void ApplyWithEitherDefined<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer<TDst> dst, PairManipulator<TSrc, TDst> manip)
611+
// Forwards to ApplyWithCore with outer: true.
public static void ApplyWithEitherDefined<TSrc, TDst>(in ReadOnlyVBuffer<TSrc> src, ref VBuffer<TDst> dst, PairManipulator<TSrc, TDst> manip)
    => ApplyWithCore(in src, ref dst, manip, outer: true);
615615

616616
/// <summary>
@@ -636,7 +636,7 @@ public static void ApplyWithEitherDefinedCopy<TSrc, TDst>(ref VBuffer<TSrc> src,
636636
/// where necessary depending on whether this is an inner or outer join of the
637637
/// indices of <paramref name="src"/> on <paramref name="dst"/>.
638638
/// </summary>
639-
private static void ApplyWithCore<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer<TDst> dst, PairManipulator<TSrc, TDst> manip, bool outer)
639+
private static void ApplyWithCore<TSrc, TDst>(in ReadOnlyVBuffer<TSrc> src, ref VBuffer<TDst> dst, PairManipulator<TSrc, TDst> manip, bool outer)
640640
{
641641
Contracts.Check(src.Length == dst.Length, "Vectors must have the same dimensionality.");
642642
Contracts.CheckValue(manip, nameof(manip));
@@ -773,7 +773,7 @@ private static void ApplyWithCore<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer
773773
// This is unnecessary -- falling through to the sparse code will
774774
// actually handle this case just fine -- but it is more efficient.
775775
Densify(ref dst);
776-
ApplyWithCore(ref src, ref dst, manip, outer);
776+
ApplyWithCore(src, ref dst, manip, outer);
777777
return;
778778
}
779779

@@ -892,7 +892,7 @@ private static void ApplyWithCore<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer
892892
Densify(ref dst);
893893
int[] indices = dst.Indices;
894894
Utils.EnsureSize(ref indices, src.Count, src.Length, keepOld: false);
895-
Array.Copy(src.Indices, indices, newCount);
895+
src.Indices.CopyTo(indices);
896896
dst = new VBuffer<TDst>(src.Length, newCount, dst.Values, indices);
897897
for (sI = 0; sI < src.Count; sI++)
898898
manip(src.Indices[sI], src.Values[sI], ref dst.Values[sI]);
@@ -1152,15 +1152,15 @@ private static void ApplyWithCoreCopy<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBu
11521152
/// </summary>
11531153
/// <seealso cref="ApplyWith{TSrc,TDst}"/>
11541154
/// <seealso cref="ApplyWithEitherDefined{TSrc,TDst}"/>
1155-
public static void ApplyIntoEitherDefined<TSrc, TDst>(ref VBuffer<TSrc> src, ref VBuffer<TDst> dst, Func<int, TSrc, TDst> func)
1155+
public static void ApplyIntoEitherDefined<TSrc, TDst>(in ReadOnlyVBuffer<TSrc> src, ref VBuffer<TDst> dst, Func<int, TSrc, TDst> func)
11561156
{
11571157
Contracts.CheckValue(func, nameof(func));
11581158

11591159
// REVIEW: The analogous WritableVector method insisted on
11601160
// equal lengths, but I don't care here.
11611161
if (src.Count == 0)
11621162
{
1163-
dst = new VBuffer<TDst>(src.Length, src.Count, dst.Values, dst.Indices);
1163+
dst = new VBuffer<TDst>(src.Length, 0, dst.Values, dst.Indices);
11641164
return;
11651165
}
11661166
int[] indices = dst.Indices;
@@ -1174,7 +1174,7 @@ public static void ApplyIntoEitherDefined<TSrc, TDst>(ref VBuffer<TSrc> src, ref
11741174
else
11751175
{
11761176
Utils.EnsureSize(ref indices, src.Count, src.Length, keepOld: false);
1177-
Array.Copy(src.Indices, indices, src.Count);
1177+
src.Indices.CopyTo(indices);
11781178
for (int i = 0; i < src.Count; ++i)
11791179
values[i] = func(src.Indices[i], src.Values[i]);
11801180
}
@@ -1189,7 +1189,7 @@ public static void ApplyIntoEitherDefined<TSrc, TDst>(ref VBuffer<TSrc> src, ref
11891189
/// necessarily be dense. Otherwise, if both are sparse, the output will be sparse iff
11901190
/// there is any slot that is not explicitly represented in either vector.
11911191
/// </summary>
1192-
public static void ApplyInto<TSrc1, TSrc2, TDst>(ref VBuffer<TSrc1> a, ref VBuffer<TSrc2> b, ref VBuffer<TDst> dst, Func<int, TSrc1, TSrc2, TDst> func)
1192+
public static void ApplyInto<TSrc1, TSrc2, TDst>(in ReadOnlyVBuffer<TSrc1> a, in ReadOnlyVBuffer<TSrc2> b, ref VBuffer<TDst> dst, Func<int, TSrc1, TSrc2, TDst> func)
11931193
{
11941194
Contracts.Check(a.Length == b.Length, "Vectors must have the same dimensionality.");
11951195
Contracts.CheckValue(func, nameof(func));
@@ -1277,7 +1277,7 @@ public static void ApplyInto<TSrc1, TSrc2, TDst>(ref VBuffer<TSrc1> a, ref VBuff
12771277
if (newCount == a.Count)
12781278
{
12791279
// Case 3, a and b actually have the same indices!
1280-
Array.Copy(a.Indices, indices, a.Count);
1280+
a.Indices.CopyTo(indices);
12811281
for (aI = 0; aI < a.Count; aI++)
12821282
{
12831283
Contracts.Assert(a.Indices[aI] == b.Indices[aI]);
@@ -1287,7 +1287,7 @@ public static void ApplyInto<TSrc1, TSrc2, TDst>(ref VBuffer<TSrc1> a, ref VBuff
12871287
else
12881288
{
12891289
// Case 4, a's indices are a subset of b's.
1290-
Array.Copy(b.Indices, indices, b.Count);
1290+
b.Indices.CopyTo(indices);
12911291
aI = 0;
12921292
for (bI = 0; aI < a.Count && bI < b.Count; bI++)
12931293
{
@@ -1302,7 +1302,7 @@ public static void ApplyInto<TSrc1, TSrc2, TDst>(ref VBuffer<TSrc1> a, ref VBuff
13021302
else if (newCount == a.Count)
13031303
{
13041304
// Case 5, b's indices are a subset of a's.
1305-
Array.Copy(a.Indices, indices, a.Count);
1305+
a.Indices.CopyTo(indices);
13061306
bI = 0;
13071307
for (aI = 0; bI < b.Count && aI < a.Count; aI++)
13081308
{

src/Microsoft.ML.Data/DataView/CompositeSchema.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public void CheckColumnInRange(int col)
6464
public void GetColumnSource(int col, out int srcIndex, out int srcCol)
6565
{
6666
CheckColumnInRange(col);
67-
if (!_cumulativeColCounts.TryFindIndexSorted(0, _cumulativeColCounts.Length, col, out srcIndex))
67+
if (!Utils.TryFindIndexSorted(_cumulativeColCounts, 0, _cumulativeColCounts.Length, col, out srcIndex))
6868
srcIndex--;
6969
Contracts.Assert(0 <= srcIndex && srcIndex < _cumulativeColCounts.Length);
7070
srcCol = col - _cumulativeColCounts[srcIndex];

0 commit comments

Comments
 (0)