Skip to content

Commit cc70940

Browse files
author
Ivan Matantsev
committed
cat hash
1 parent 5874e16 commit cc70940

File tree

7 files changed

+594
-132
lines changed

7 files changed

+594
-132
lines changed

src/Microsoft.ML.Data/Transforms/HashTransform.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ public bool TryUnparse(StringBuilder sb)
116116
}
117117
}
118118

119-
public sealed class ColumnInfo
119+
public class ColumnInfo
120120
{
121121
public readonly string Input;
122122
public readonly string Output;

src/Microsoft.ML.Transforms/CategoricalHashTransform.cs

Lines changed: 247 additions & 98 deletions
Large diffs are not rendered by default.

src/Microsoft.ML.Transforms/CategoricalTransform.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -454,13 +454,13 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env, Pipelin
454454
}
455455

456456
/// <summary>
457-
/// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array
457+
/// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array.
458458
/// </summary>
459459
/// <param name="input">Incoming data.</param>
460460
/// <param name="outputKind">Specify output type of indicator array: array or binary encoded data.</param>
461461
/// <param name="order">How the Id for each value is assigned: by occurrence or by value.</param>
462462
/// <param name="maxItems">Maximum number of ids to keep during data scanning.</param>
463-
/// /// <param name="onFit">Called upon fitting with the learnt enumeration on the dataset.</param>
463+
/// <param name="onFit">Called upon fitting with the learnt enumeration on the dataset.</param>
464464
public static Vector<float> OneHotEncoding(this Scalar<string> input, OneHotScalarOutputKind outputKind = (OneHotScalarOutputKind)DefOut, KeyValueOrder order = DefSort,
465465
int maxItems = DefMax, ToKeyFitResult<ReadOnlyMemory<char>>.OnFit onFit = null)
466466
{
@@ -469,7 +469,7 @@ public static Vector<float> OneHotEncoding(this Scalar<string> input, OneHotScal
469469
}
470470

471471
/// <summary>
472-
/// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array
472+
/// Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array.
473473
/// </summary>
474474
/// <param name="input">Incoming data.</param>
475475
/// <param name="outputKind">Specify output type of indicator array: Multiarray, array or binary encoded data.</param>
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#@ TextLoader{
2+
#@ sep=tab
3+
#@ col=A:R4:0-17
4+
#@ col=B:R4:18-89
5+
#@ col=C:R4:90-161
6+
#@ col=D:R4:162-179
7+
#@ col=E:R4:180-251
8+
#@ }
9+
252 4:1 6:1 7:1 9:1 14:1 17:1 22:1 24:1 25:1 27:1 32:1 35:1 38:1 43:1 44:1 45:1 46:1 47:1 48:1 49:1 53:1 64:1 66:1 68:1 71:1 74:1 75:1 76:1 77:1 78:1 80:1 81:1 82:1 84:1 87:1 94:1 96:1 97:1 99:1 104:1 107:1 110:1 115:1 116:1 117:1 118:1 119:1 120:1 121:1 125:1 136:1 138:1 140:1 143:1 146:1 147:1 148:1 149:1 150:1 152:1 153:1 154:1 156:1 159:1 166:1 168:1 169:1 171:1 176:1 179:1 184:1 186:1 187:1 189:1 194:1 197:1 200:1 205:1 206:1 207:1 208:1 209:1 210:1 211:1 215:1 226:1 228:1 230:1 233:1 236:1 237:1 238:1 239:1 240:1 242:1 243:1 244:1 246:1 249:1
10+
0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1
11+
0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0
12+
0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#@ TextLoader{
2+
#@ sep=tab
3+
#@ col=A:R4:0-17
4+
#@ col=B:R4:18-89
5+
#@ col=C:R4:90-161
6+
#@ col=D:R4:162-179
7+
#@ col=E:R4:180-251
8+
#@ }
9+
252 4:1 6:1 7:1 9:1 14:1 17:1 22:1 24:1 25:1 27:1 32:1 35:1 38:1 43:1 44:1 45:1 46:1 47:1 48:1 49:1 53:1 64:1 66:1 68:1 71:1 74:1 75:1 76:1 77:1 78:1 80:1 81:1 82:1 84:1 87:1 94:1 96:1 97:1 99:1 104:1 107:1 110:1 115:1 116:1 117:1 118:1 119:1 120:1 121:1 125:1 136:1 138:1 140:1 143:1 146:1 147:1 148:1 149:1 150:1 152:1 153:1 154:1 156:1 159:1 166:1 168:1 169:1 171:1 176:1 179:1 184:1 186:1 187:1 189:1 194:1 197:1 200:1 205:1 206:1 207:1 208:1 209:1 210:1 211:1 215:1 226:1 228:1 230:1 233:1 236:1 237:1 238:1 239:1 240:1 242:1 243:1 244:1 246:1 249:1
10+
0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1
11+
0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0
12+
0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 0 0 1 0 0

0 commit comments

Comments
 (0)