diff --git a/src/Microsoft.ML.Data/Prediction/Calibrator.cs b/src/Microsoft.ML.Data/Prediction/Calibrator.cs index 6e08ee38f4..885ab84d9a 100644 --- a/src/Microsoft.ML.Data/Prediction/Calibrator.cs +++ b/src/Microsoft.ML.Data/Prediction/Calibrator.cs @@ -1110,20 +1110,22 @@ private static VersionInfo GetVersionInfo() public readonly float Min; /// The value of probability in each bin. - public readonly float[] BinProbs; + public IReadOnlyList BinProbs => _binProbs; + + private readonly float[] _binProbs; /// Initializes a new instance of . /// The to use. /// The minimum value in the first bin. /// The values of the probability in each bin. /// The bin size. - public NaiveCalibrator(IHostEnvironment env, float min, float binSize, float[] binProbs) + internal NaiveCalibrator(IHostEnvironment env, float min, float binSize, float[] binProbs) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(RegistrationName); Min = min; BinSize = binSize; - BinProbs = binProbs; + _binProbs = binProbs; } private NaiveCalibrator(IHostEnvironment env, ModelLoadContext ctx) @@ -1147,9 +1149,9 @@ private NaiveCalibrator(IHostEnvironment env, ModelLoadContext ctx) Min = ctx.Reader.ReadFloat(); _host.CheckDecode(FloatUtils.IsFinite(Min)); - BinProbs = ctx.Reader.ReadFloatArray(); - _host.CheckDecode(Utils.Size(BinProbs) > 0); - _host.CheckDecode(BinProbs.All(x => (0 <= x && x <= 1))); + _binProbs = ctx.Reader.ReadFloatArray(); + _host.CheckDecode(Utils.Size(_binProbs) > 0); + _host.CheckDecode(_binProbs.All(x => (0 <= x && x <= 1))); } private static NaiveCalibrator Create(IHostEnvironment env, ModelLoadContext ctx) @@ -1180,7 +1182,7 @@ private void SaveCore(ModelSaveContext ctx) ctx.Writer.Write(sizeof(float)); ctx.Writer.Write(BinSize); ctx.Writer.Write(Min); - ctx.Writer.WriteSingleArray(BinProbs); + ctx.Writer.WriteSingleArray(_binProbs); } /// @@ -1190,8 +1192,8 @@ public float PredictProbability(float output) { if (float.IsNaN(output)) return output; - int binIdx = GetBinIdx(output, Min, BinSize, BinProbs.Length); - return BinProbs[binIdx]; + int binIdx = GetBinIdx(output, Min, BinSize, _binProbs.Length); + return _binProbs[binIdx]; } // get the bin for a given output @@ -1205,11 +1207,6 @@ internal static int GetBinIdx(float output, float min, float binSize, int numBin return binIdx; } - /// Get the summary of current calibrator settings - public string GetSummary() - { - return string.Format("Naive Calibrator has {0} bins, starting at {1}, with bin size of {2}", BinProbs.Length, Min, BinSize); - } } /// @@ -1218,8 +1215,91 @@ public string GetSummary() [BestFriend] internal abstract class CalibratorTrainerBase : ICalibratorTrainer { + public sealed class DataStore : IEnumerable + { + public readonly struct DataItem + { + // The actual binary label of this example. + public readonly bool Target; + // The weight associated with this example. + public readonly float Weight; + // The output of the example. + public readonly float Score; + + public DataItem(bool target, float weight, float score) + { + Target = target; + Weight = weight; + Score = score; + } + } + + // REVIEW: Should probably be a long. + private int _itemsSeen; + private readonly Random _random; + + private static int _randSeed; + + private readonly int _capacity; + private DataItem[] _data; + private bool _dataSorted; + + public DataStore() + : this(1000000) + { + } + + public DataStore(int capacity) + { + Contracts.CheckParam(capacity > 0, nameof(capacity), "must be positive"); + + _capacity = capacity; + _data = new DataItem[Math.Min(4, capacity)]; + // REVIEW: Horrifying. At a point when we have the IHost stuff plumbed through + // calibrator training and also have the appetite to change a bunch of baselines, this + // should be seeded using the host random. + _random = new System.Random(System.Threading.Interlocked.Increment(ref _randSeed) - 1); + } + + /// + /// An enumerator over the entries sorted by score. + /// + /// + public IEnumerator GetEnumerator() + { + if (!_dataSorted) + { + var comp = Comparer.Create((x, y) => x.Score.CompareTo(y.Score)); + Array.Sort(_data, 0, Math.Min(_itemsSeen, _capacity), comp); + _dataSorted = true; + } + return _data.Take(_itemsSeen).GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + public void AddToStore(float score, bool isPositive, float weight) + { + // Can't calibrate NaN scores. + if (weight == 0 || float.IsNaN(score)) + return; + int index = _itemsSeen++; + if (_itemsSeen <= _capacity) + Utils.EnsureSize(ref _data, _itemsSeen, _capacity); + else + { + index = _random.Next(_itemsSeen); // 0 to items_seen - 1. + if (index >= _capacity) // Don't keep it. + return; + } + _data[index] = new DataItem(isPositive, weight, score); + } + } protected readonly IHost Host; - protected CalibrationDataStore Data; + protected DataStore Data; protected const int DefaultMaxNumSamples = 1000000; protected int MaxNumSamples; @@ -1239,7 +1319,7 @@ protected CalibratorTrainerBase(IHostEnvironment env, string name) bool ICalibratorTrainer.ProcessTrainingExample(float output, bool labelIs1, float weight) { if (Data == null) - Data = new CalibrationDataStore(MaxNumSamples); + Data = new DataStore(MaxNumSamples); Data.AddToStore(output, labelIs1, weight); return true; } @@ -1485,7 +1565,13 @@ private static VersionInfo GetVersionInfo() private readonly IHost _host; + /// + /// Slope value for this calibrator. + /// public Double Slope { get; } + /// + /// Offset value for this calibrator + /// public Double Offset { get; } bool ICanSavePfa.CanSavePfa => true; bool ICanSaveOnnx.CanSaveOnnx(OnnxContext ctx) => true; @@ -1493,7 +1579,7 @@ private static VersionInfo GetVersionInfo() /// /// Initializes a new instance of . /// - public PlattCalibrator(IHostEnvironment env, Double slope, Double offset) + internal PlattCalibrator(IHostEnvironment env, Double slope, Double offset) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(RegistrationName); @@ -1556,6 +1642,7 @@ private void SaveCore(ModelSaveContext ctx) } } + /// Given a classifier output, produce the probability. public float PredictProbability(float output) { if (float.IsNaN(output)) @@ -1563,7 +1650,7 @@ public float PredictProbability(float output) return PredictProbability(output, Slope, Offset); } - public static float PredictProbability(float output, Double a, Double b) + internal static float PredictProbability(float output, Double a, Double b) { return (float)(1 / (1 + Math.Exp(a * output + b))); } @@ -1597,11 +1684,6 @@ bool ISingleCanSaveOnnx.SaveAsOnnx(OnnxContext ctx, string[] scoreProbablityColu return true; } - public string GetSummary() - { - return string.Format("Platt calibrator parameters: A={0}, B={1}", Slope, Offset); - } - IParameterMixer IParameterMixer.CombineParameters(IList calibrators) { Double a = 0; @@ -1703,12 +1785,17 @@ public override ICalibrator CreateCalibrator(IChannel ch) /// /// The pair-adjacent violators calibrator. - /// The function that is implemented by this calibrator is: - /// f(x) = v_i, if minX_i <= x <= maxX_i - /// = linear interpolate between v_i and v_i+1, if maxX_i < x < minX_i+1 - /// = v_0, if x < minX_0 - /// = v_n, if x > maxX_n /// + /// + /// The function that is implemented by this calibrator is: + /// P(x) = + /// + /// [i], if [i] <= x <= [i]> + /// Linear interpolation between [i] and [i+1], if [i] < x < [i+1] + /// [0], if x < [0] + /// [n], if x > [n] + /// + /// public sealed class PavCalibrator : ICalibrator, ICanSaveInBinaryFormat { internal const string LoaderSignature = "PAVCaliExec"; @@ -1731,8 +1818,17 @@ private static VersionInfo GetVersionInfo() private const float MaxToReturn = 1 - Epsilon; // max predicted is 1 - min; private readonly IHost _host; + /// + /// Bottom borders of PAV intervals. + /// public readonly ImmutableArray Mins; + /// + /// Upper borders of PAV intervals. + /// public readonly ImmutableArray Maxes; + /// + /// Values of PAV intervals. + /// public readonly ImmutableArray Values; /// @@ -1742,7 +1838,7 @@ private static VersionInfo GetVersionInfo() /// The minimum values for each piece. /// The maximum values for each piece. /// The actual values for each piece. - public PavCalibrator(IHostEnvironment env, ImmutableArray mins, ImmutableArray maxes, ImmutableArray values) + internal PavCalibrator(IHostEnvironment env, ImmutableArray mins, ImmutableArray maxes, ImmutableArray values) { Contracts.AssertValue(env); _host = env.Register(RegistrationName); @@ -1851,6 +1947,7 @@ private void SaveCore(ModelSaveContext ctx) _host.CheckDecode(valuePrev <= 1); } + /// Given a classifier output, produce the probability. public float PredictProbability(float output) { if (float.IsNaN(output)) @@ -1890,95 +1987,6 @@ private float FindValue(float score) float t = (score - Maxes[pos - 1]) / (Mins[pos] - Maxes[pos - 1]); return Values[pos - 1] + t * (Values[pos] - Values[pos - 1]); } - - public string GetSummary() - { - return string.Format("PAV calibrator with {0} intervals", Mins.Length); - } - } - - public sealed class CalibrationDataStore : IEnumerable - { - public readonly struct DataItem - { - // The actual binary label of this example. - public readonly bool Target; - // The weight associated with this example. - public readonly float Weight; - // The output of the example. - public readonly float Score; - - public DataItem(bool target, float weight, float score) - { - Target = target; - Weight = weight; - Score = score; - } - } - - // REVIEW: Should probably be a long. - private int _itemsSeen; - private readonly Random _random; - - private static int _randSeed; - - private readonly int _capacity; - private DataItem[] _data; - private bool _dataSorted; - - public CalibrationDataStore() - : this(1000000) - { - } - - public CalibrationDataStore(int capacity) - { - Contracts.CheckParam(capacity > 0, nameof(capacity), "must be positive"); - - _capacity = capacity; - _data = new DataItem[Math.Min(4, capacity)]; - // REVIEW: Horrifying. At a point when we have the IHost stuff plumbed through - // calibrator training and also have the appetite to change a bunch of baselines, this - // should be seeded using the host random. - _random = new System.Random(System.Threading.Interlocked.Increment(ref _randSeed) - 1); - } - - /// - /// An enumerator over the entries sorted by score. - /// - /// - public IEnumerator GetEnumerator() - { - if (!_dataSorted) - { - var comp = Comparer.Create((x, y) => x.Score.CompareTo(y.Score)); - Array.Sort(_data, 0, Math.Min(_itemsSeen, _capacity), comp); - _dataSorted = true; - } - return _data.Take(_itemsSeen).GetEnumerator(); - } - - IEnumerator IEnumerable.GetEnumerator() - { - return GetEnumerator(); - } - - public void AddToStore(float score, bool isPositive, float weight) - { - // Can't calibrate NaN scores. - if (weight == 0 || float.IsNaN(score)) - return; - int index = _itemsSeen++; - if (_itemsSeen <= _capacity) - Utils.EnsureSize(ref _data, _itemsSeen, _capacity); - else - { - index = _random.Next(_itemsSeen); // 0 to items_seen - 1. - if (index >= _capacity) // Don't keep it. - return; - } - _data[index] = new DataItem(isPositive, weight, score); - } } internal static class Calibrate diff --git a/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs b/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs index f352becdbb..d0e353c3c6 100644 --- a/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs +++ b/src/Microsoft.ML.HalLearners/OlsLinearRegression.cs @@ -556,19 +556,25 @@ private static VersionInfo GetVersionInfo() /// and all subsequent correspond to each weight in turn. This is null if and /// only if is false. /// - public readonly IReadOnlyList StandardErrors; + public IReadOnlyList StandardErrors => _standardErrors; + + private readonly double[] _standardErrors; /// /// t-Statistic values corresponding to each of the model standard errors. This is /// null if and only if is false. /// - public readonly IReadOnlyList TValues; + public IReadOnlyList TValues => _tValues; + + private readonly double[] _tValues; /// /// p-values corresponding to each of the model standard errors. This is null /// if and only if is false. /// - public readonly IReadOnlyList PValues; + public IReadOnlyList PValues => _pValues; + + private readonly double[] _pValues; /// /// Constructs a new OLS regression model parameters from trained model. @@ -612,9 +618,9 @@ internal OlsLinearRegressionModelParameters(IHostEnvironment env, in VBuffer= 0); + Host.CheckDecode(FloatUtils.IsFinite(_standardErrors[i]) && _standardErrors[i] >= 0); - TValues = ctx.Reader.ReadDoubleArray(m); - TValueCheckDecode(Bias, TValues[0]); + _tValues = ctx.Reader.ReadDoubleArray(m); + TValueCheckDecode(Bias, _tValues[0]); var weightValues = Weight.GetValues(); for (int i = 1; i < m; ++i) - TValueCheckDecode(weightValues[i - 1], TValues[i]); + TValueCheckDecode(weightValues[i - 1], _tValues[i]); - PValues = ctx.Reader.ReadDoubleArray(m); + _pValues = ctx.Reader.ReadDoubleArray(m); for (int i = 0; i < m; ++i) - ProbCheckDecode(PValues[i]); + ProbCheckDecode(_pValues[i]); } private protected override void SaveCore(ModelSaveContext ctx) @@ -682,15 +688,15 @@ private protected override void SaveCore(ModelSaveContext ctx) ctx.Writer.WriteBoolByte(HasStatistics); if (!HasStatistics) { - Contracts.Assert(StandardErrors == null & TValues == null & PValues == null); + Contracts.Assert(_standardErrors == null & _tValues == null & _pValues == null); return; } - Contracts.Assert(Weight.Length + 1 == StandardErrors.Count); - Contracts.Assert(Weight.Length + 1 == TValues.Count); - Contracts.Assert(Weight.Length + 1 == PValues.Count); - ctx.Writer.WriteDoublesNoCount(StandardErrors as double[]); - ctx.Writer.WriteDoublesNoCount(TValues as double[]); - ctx.Writer.WriteDoublesNoCount(PValues as double[]); + Contracts.Assert(Weight.Length + 1 == _standardErrors.Length); + Contracts.Assert(Weight.Length + 1 == _tValues.Length); + Contracts.Assert(Weight.Length + 1 == _pValues.Length); + ctx.Writer.WriteDoublesNoCount(_standardErrors); + ctx.Writer.WriteDoublesNoCount(_tValues); + ctx.Writer.WriteDoublesNoCount(_pValues); } private static void TValueCheckDecode(Double param, Double tvalue) @@ -725,14 +731,14 @@ private protected override void SaveSummary(TextWriter writer, RoleMappedSchema writer.WriteLine(); writer.WriteLine("Index\tName\tWeight\tStdErr\tt-Value\tp-Value"); const string format = "{0}\t{1}\t{2}\t{3:g4}\t{4:g4}\t{5:e4}"; - writer.WriteLine(format, "", "Bias", Bias, StandardErrors[0], TValues[0], PValues[0]); + writer.WriteLine(format, "", "Bias", Bias, _standardErrors[0], _tValues[0], _pValues[0]); Contracts.Assert(Weight.IsDense); var coeffs = Weight.GetValues(); for (int i = 0; i < coeffs.Length; i++) { var name = names.GetItemOrDefault(i); writer.WriteLine(format, i, name.IsEmpty ? $"f{i}" : name.ToString(), - coeffs[i], StandardErrors[i + 1], TValues[i + 1], PValues[i + 1]); + coeffs[i], _standardErrors[i + 1], _tValues[i + 1], _pValues[i + 1]); } } else diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs index d4cb435903..77755ba91e 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs @@ -64,15 +64,19 @@ private static VersionInfo GetVersionInfo() /// This is two dimensional matrix with size of * flattened into one-dimensional matrix. /// Row by row. /// - public readonly IReadOnlyList LeftFactorMatrix; + public IReadOnlyList LeftFactorMatrix => _leftFactorMatrix; + + private readonly float[] _leftFactorMatrix; /// - /// Left approximation matrix + /// Right approximation matrix /// /// /// This is two dimensional matrix with size of * flattened into one-dimensional matrix. /// Row by row. /// - public readonly IReadOnlyList RightFactorMatrix; + public IReadOnlyList RightFactorMatrix => _rightFactorMatrix; + + private readonly float[] _rightFactorMatrix; PredictionKind IPredictor.PredictionKind => PredictionKind.Recommendation; @@ -91,12 +95,12 @@ internal MatrixFactorizationModelParameters(IHostEnvironment env, SafeTrainingAn _host.CheckValue(matrixColumnIndexType, nameof(matrixColumnIndexType)); _host.CheckValue(matrixRowIndexType, nameof(matrixRowIndexType)); buffer.Get(out NumberOfRows, out NumberOfColumns, out ApproximationRank, out var leftFactorMatrix, out var rightFactorMatrix); - LeftFactorMatrix = leftFactorMatrix; - RightFactorMatrix = rightFactorMatrix; + _leftFactorMatrix = leftFactorMatrix; + _rightFactorMatrix = rightFactorMatrix; _host.Assert(NumberOfColumns == matrixColumnIndexType.GetCountAsInt32(_host)); _host.Assert(NumberOfRows == matrixRowIndexType.GetCountAsInt32(_host)); - _host.Assert(LeftFactorMatrix.Count == NumberOfRows * ApproximationRank); - _host.Assert(RightFactorMatrix.Count == ApproximationRank * NumberOfColumns); + _host.Assert(_leftFactorMatrix.Length == NumberOfRows * ApproximationRank); + _host.Assert(_rightFactorMatrix.Length == ApproximationRank * NumberOfColumns); MatrixColumnIndexType = matrixColumnIndexType; MatrixRowIndexType = matrixRowIndexType; @@ -134,8 +138,8 @@ private MatrixFactorizationModelParameters(IHostEnvironment env, ModelLoadContex ApproximationRank = ctx.Reader.ReadInt32(); _host.CheckDecode(ApproximationRank > 0); - LeftFactorMatrix = Utils.ReadSingleArray(ctx.Reader, checked(NumberOfRows * ApproximationRank)); - RightFactorMatrix = Utils.ReadSingleArray(ctx.Reader, checked(NumberOfColumns * ApproximationRank)); + _leftFactorMatrix = Utils.ReadSingleArray(ctx.Reader, checked(NumberOfRows * ApproximationRank)); + _rightFactorMatrix = Utils.ReadSingleArray(ctx.Reader, checked(NumberOfColumns * ApproximationRank)); MatrixColumnIndexType = new KeyType(typeof(uint), NumberOfColumns); MatrixRowIndexType = new KeyType(typeof(uint), NumberOfRows); @@ -173,10 +177,10 @@ void ICanSaveModel.Save(ModelSaveContext ctx) ctx.Writer.Write(NumberOfRows); ctx.Writer.Write(NumberOfColumns); ctx.Writer.Write(ApproximationRank); - _host.Check(Utils.Size(LeftFactorMatrix) == NumberOfRows * ApproximationRank, "Unexpected matrix size of a factor matrix (matrix P in LIBMF paper)"); - _host.Check(Utils.Size(RightFactorMatrix) == NumberOfColumns * ApproximationRank, "Unexpected matrix size of a factor matrix (matrix Q in LIBMF paper)"); - Utils.WriteSinglesNoCount(ctx.Writer, LeftFactorMatrix as float[]); - Utils.WriteSinglesNoCount(ctx.Writer, RightFactorMatrix as float[]); + _host.Check(Utils.Size(_leftFactorMatrix) == NumberOfRows * ApproximationRank, "Unexpected matrix size of a factor matrix (matrix P in LIBMF paper)"); + _host.Check(Utils.Size(_rightFactorMatrix) == NumberOfColumns * ApproximationRank, "Unexpected matrix size of a factor matrix (matrix Q in LIBMF paper)"); + Utils.WriteSinglesNoCount(ctx.Writer, _leftFactorMatrix); + Utils.WriteSinglesNoCount(ctx.Writer, _rightFactorMatrix); } /// @@ -186,18 +190,18 @@ void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema) { writer.WriteLine("# Imputed matrix is P * Q'"); writer.WriteLine("# P in R^({0} x {1}), rows correpond to Y item", NumberOfRows, ApproximationRank); - for (int i = 0; i < LeftFactorMatrix.Count; ++i) + for (int i = 0; i < _leftFactorMatrix.Length; ++i) { - writer.Write(LeftFactorMatrix[i].ToString("G")); + writer.Write(_leftFactorMatrix[i].ToString("G")); if (i % ApproximationRank == ApproximationRank - 1) writer.WriteLine(); else writer.Write('\t'); } writer.WriteLine("# Q in R^({0} x {1}), rows correpond to X item", NumberOfColumns, ApproximationRank); - for (int i = 0; i < RightFactorMatrix.Count; ++i) + for (int i = 0; i < _rightFactorMatrix.Length; ++i) { - writer.Write(RightFactorMatrix[i].ToString("G")); + writer.Write(_rightFactorMatrix[i].ToString("G")); if (i % ApproximationRank == ApproximationRank - 1) writer.WriteLine(); else @@ -272,7 +276,7 @@ private float Score(int columnIndex, int rowIndex) // Starting position of the columnIndex-th column in the right factor factor matrix int columnOffset = columnIndex * ApproximationRank; for (int i = 0; i < ApproximationRank; i++) - score += LeftFactorMatrix[rowOffset + i] * RightFactorMatrix[columnOffset + i]; + score += _leftFactorMatrix[rowOffset + i] * _rightFactorMatrix[columnOffset + i]; return score; }