-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Creation of components through MLContext and cleanup (text transform) #2394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c10ca38
22c9427
7b04a30
c1b2b67
136abd9
61adaa8
1ec8739
5508581
47fee69
401fedf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,34 +20,32 @@ public static class TextCatalog | |
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> | ||
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> | ||
/// <param name="advancedSettings">Advanced transform settings</param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
you removing this example becaue it doesn't use this exact method? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeap. removed it from here, and added to the API below (this was one of Senja's comments) In reply to: 254802756 [](ancestors = 254802756) |
||
/// ]]> | ||
/// </format> | ||
/// </example> | ||
public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.TextTransforms catalog, | ||
string outputColumnName, | ||
string inputColumnName = null, | ||
Action<TextFeaturizingEstimator.Settings> advancedSettings = null) | ||
string inputColumnName = null) | ||
=> new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), | ||
outputColumnName, inputColumnName, advancedSettings); | ||
outputColumnName, inputColumnName); | ||
|
||
/// <summary> | ||
/// Transform several text columns into featurized float array that represents counts of ngrams and char-grams. | ||
/// </summary> | ||
/// <param name="catalog">The text-related transform's catalog.</param> | ||
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnNames"/>.</param> | ||
/// <param name="inputColumnNames">Name of the columns to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> | ||
/// <param name="advancedSettings">Advanced transform settings</param> | ||
/// <param name="options">Advanced options to the algorithm.</param> | ||
/// <example> | ||
/// <format type="text/markdown"> | ||
/// <] | ||
/// ]]> | ||
/// </format> | ||
/// </example> | ||
public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.TextTransforms catalog, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
link the sample to this one, since it illustrates the usage of Options. #Resolved |
||
string outputColumnName, | ||
IEnumerable<string> inputColumnNames, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I know it is not your change, but can we go with params, rather than an IEnumerable? Looking at the sample, having to construct a List is overkill. #Pending There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
Action<TextFeaturizingEstimator.Settings> advancedSettings = null) | ||
TextFeaturizingEstimator.Options options) | ||
=> new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), | ||
outputColumnName, inputColumnNames, advancedSettings); | ||
outputColumnName, inputColumnNames, options); | ||
|
||
/// <summary> | ||
/// Tokenize incoming text in <paramref name="inputColumnName"/> and output the tokens as <paramref name="outputColumnName"/>. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -120,26 +120,59 @@ internal sealed class Arguments : TransformInputBase | |
public TextNormKind VectorNormalizer = TextNormKind.L2; | ||
} | ||
|
||
public sealed class Settings | ||
/// <summary> | ||
/// Advanced options for the <see cref="TextFeaturizingEstimator"/>. | ||
/// </summary> | ||
public sealed class Options | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a summary tag for each of these settings, and for the Options class? You can take them from the constructor I believe. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
can you remove this, and use the Arguments above, like for everything else? make the Factories internal, so they are not visible. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we don't want to do that. In reply to: 254358363 [](ancestors = 254358363) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. perhaps we should do some of this cleanup outside purview of this PR ... I need to understand properly how the factories are used etc. In reply to: 254461580 [](ancestors = 254461580,254358363) |
||
{ | ||
#pragma warning disable MSML_NoInstanceInitializers // No initializers on instance fields or properties | ||
/// <summary> | ||
/// Dataset language. | ||
/// </summary> | ||
public Language TextLanguage { get; set; } = DefaultLanguage; | ||
/// <summary> | ||
/// Casing used for the text. | ||
/// </summary> | ||
public CaseNormalizationMode TextCase { get; set; } = CaseNormalizationMode.Lower; | ||
/// <summary> | ||
/// Whether to keep diacritical marks or remove them. | ||
/// </summary> | ||
public bool KeepDiacritics { get; set; } = false; | ||
/// <summary> | ||
/// Whether to keep punctuation marks or remove them. | ||
/// </summary> | ||
public bool KeepPunctuations { get; set; } = true; | ||
/// <summary> | ||
/// Whether to keep numbers or remove them. | ||
/// </summary> | ||
public bool KeepNumbers { get; set; } = true; | ||
/// <summary> | ||
/// Whether to output the transformed text tokens as an additional column. | ||
/// </summary> | ||
public bool OutputTokens { get; set; } = false; | ||
/// <summary> | ||
/// Vector Normalizer to use. | ||
/// </summary> | ||
public TextNormKind VectorNormalizer { get; set; } = TextNormKind.L2; | ||
/// <summary> | ||
/// Whether to use stop remover or not. | ||
/// </summary> | ||
public bool UseStopRemover { get; set; } = false; | ||
/// <summary> | ||
/// Whether to use char extractor or not. | ||
/// </summary> | ||
public bool UseCharExtractor { get; set; } = true; | ||
/// <summary> | ||
/// Whether to use word extractor or not. | ||
/// </summary> | ||
public bool UseWordExtractor { get; set; } = true; | ||
#pragma warning restore MSML_NoInstanceInitializers // No initializers on instance fields or properties | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole thing requires proper cleaning/documentation. Do you want to do it in this PR, or you prefer leave it for later cleaning? #Pending There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current PR is to fix the public API We are doing some cleaning/doc fixes on an opportunistic basis. But we should have a separate issue / PRs for "proper" cleaning and documentation In reply to: 254013725 [](ancestors = 254013725) |
||
|
||
public readonly string OutputColumn; | ||
internal readonly string OutputColumn; | ||
private readonly string[] _inputColumns; | ||
public IReadOnlyCollection<string> InputColumns => _inputColumns.AsReadOnly(); | ||
public Settings AdvancedSettings { get; } | ||
internal IReadOnlyCollection<string> InputColumns => _inputColumns.AsReadOnly(); | ||
internal Options OptionalSettings { get; } | ||
|
||
// These parameters are hardcoded for now. | ||
// REVIEW: expose them once sub-transforms are estimators. | ||
|
@@ -232,18 +265,18 @@ public bool NeedInitialSourceColumnConcatTransform | |
public TransformApplierParams(TextFeaturizingEstimator parent) | ||
{ | ||
var host = parent._host; | ||
host.Check(Enum.IsDefined(typeof(Language), parent.AdvancedSettings.TextLanguage)); | ||
host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), parent.AdvancedSettings.TextCase)); | ||
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage)); | ||
host.Check(Enum.IsDefined(typeof(CaseNormalizationMode), parent.OptionalSettings.TextCase)); | ||
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary); | ||
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary); | ||
VectorNormalizer = parent.AdvancedSettings.VectorNormalizer; | ||
Language = parent.AdvancedSettings.TextLanguage; | ||
UsePredefinedStopWordRemover = parent.AdvancedSettings.UseStopRemover; | ||
TextCase = parent.AdvancedSettings.TextCase; | ||
KeepDiacritics = parent.AdvancedSettings.KeepDiacritics; | ||
KeepPunctuations = parent.AdvancedSettings.KeepPunctuations; | ||
KeepNumbers = parent.AdvancedSettings.KeepNumbers; | ||
OutputTextTokens = parent.AdvancedSettings.OutputTokens; | ||
VectorNormalizer = parent.OptionalSettings.VectorNormalizer; | ||
Language = parent.OptionalSettings.TextLanguage; | ||
UsePredefinedStopWordRemover = parent.OptionalSettings.UseStopRemover; | ||
TextCase = parent.OptionalSettings.TextCase; | ||
KeepDiacritics = parent.OptionalSettings.KeepDiacritics; | ||
KeepPunctuations = parent.OptionalSettings.KeepPunctuations; | ||
KeepNumbers = parent.OptionalSettings.KeepNumbers; | ||
OutputTextTokens = parent.OptionalSettings.OutputTokens; | ||
Dictionary = parent._dictionary; | ||
} | ||
} | ||
|
@@ -254,40 +287,42 @@ public TransformApplierParams(TextFeaturizingEstimator parent) | |
internal const string UserName = "Text Transform"; | ||
internal const string LoaderSignature = "Text"; | ||
|
||
public const Language DefaultLanguage = Language.English; | ||
internal const Language DefaultLanguage = Language.English; | ||
|
||
private const string TransformedTextColFormat = "{0}_TransformedText"; | ||
|
||
public TextFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, | ||
Action<Settings> advancedSettings = null) | ||
: this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, advancedSettings) | ||
internal TextFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null) | ||
: this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }) | ||
{ | ||
} | ||
|
||
public TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable<string> source, | ||
Action<Settings> advancedSettings = null) | ||
internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable<string> source, Options options = null) | ||
{ | ||
Contracts.CheckValue(env, nameof(env)); | ||
_host = env.Register(nameof(TextFeaturizingEstimator)); | ||
_host.CheckValue(source, nameof(source)); | ||
_host.CheckParam(source.Any(), nameof(source)); | ||
_host.CheckParam(!source.Any(string.IsNullOrWhiteSpace), nameof(source)); | ||
_host.CheckNonEmpty(name, nameof(name)); | ||
_host.CheckValueOrNull(advancedSettings); | ||
_host.CheckValueOrNull(options); | ||
|
||
_inputColumns = source.ToArray(); | ||
OutputColumn = name; | ||
|
||
AdvancedSettings = new Settings(); | ||
advancedSettings?.Invoke(AdvancedSettings); | ||
OptionalSettings = new Options(); | ||
if (options != null) | ||
OptionalSettings = options; | ||
|
||
_dictionary = null; | ||
if (AdvancedSettings.UseWordExtractor) | ||
if (OptionalSettings.UseWordExtractor) | ||
_wordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments(); | ||
if (AdvancedSettings.UseCharExtractor) | ||
if (OptionalSettings.UseCharExtractor) | ||
_charFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false }; | ||
} | ||
|
||
/// <summary> | ||
/// Trains and returns a <see cref="Transformer"/>. | ||
/// </summary> | ||
public ITransformer Fit(IDataView input) | ||
{ | ||
var h = _host; | ||
|
@@ -463,14 +498,18 @@ public ITransformer Fit(IDataView input) | |
return new Transformer(_host, input, view); | ||
} | ||
|
||
public static ITransformer Create(IHostEnvironment env, ModelLoadContext ctx) | ||
private static ITransformer Create(IHostEnvironment env, ModelLoadContext ctx) | ||
=> new Transformer(env, ctx); | ||
|
||
private static string GenerateColumnName(Schema schema, string srcName, string xfTag) | ||
{ | ||
return schema.GetTempColumnName(string.Format("{0}_{1}", srcName, xfTag)); | ||
} | ||
|
||
/// <summary> | ||
/// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer. | ||
/// Used for schema propagation and verification in a pipeline. | ||
/// </summary> | ||
public SchemaShape GetOutputSchema(SchemaShape inputSchema) | ||
{ | ||
_host.CheckValue(inputSchema, nameof(inputSchema)); | ||
|
@@ -485,12 +524,12 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) | |
|
||
var metadata = new List<SchemaShape.Column>(2); | ||
metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextType.Instance, false)); | ||
if (AdvancedSettings.VectorNormalizer != TextNormKind.None) | ||
if (OptionalSettings.VectorNormalizer != TextNormKind.None) | ||
metadata.Add(new SchemaShape.Column(MetadataUtils.Kinds.IsNormalized, SchemaShape.Column.VectorKind.Scalar, BoolType.Instance, false)); | ||
|
||
result[OutputColumn] = new SchemaShape.Column(OutputColumn, SchemaShape.Column.VectorKind.Vector, NumberType.R4, false, | ||
new SchemaShape(metadata)); | ||
if (AdvancedSettings.OutputTokens) | ||
if (OptionalSettings.OutputTokens) | ||
{ | ||
string name = string.Format(TransformedTextColFormat, OutputColumn); | ||
result[name] = new SchemaShape.Column(name, SchemaShape.Column.VectorKind.VariableVector, TextType.Instance, false); | ||
|
@@ -502,18 +541,18 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) | |
// Factory method for SignatureDataTransform. | ||
internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView data) | ||
{ | ||
Action<Settings> settings = s => | ||
var settings = new Options | ||
{ | ||
s.TextLanguage = args.Language; | ||
s.TextCase = args.TextCase; | ||
s.KeepDiacritics = args.KeepDiacritics; | ||
s.KeepPunctuations = args.KeepPunctuations; | ||
s.KeepNumbers = args.KeepNumbers; | ||
s.OutputTokens = args.OutputTokens; | ||
s.VectorNormalizer = args.VectorNormalizer; | ||
s.UseStopRemover = args.UsePredefinedStopWordRemover; | ||
s.UseWordExtractor = args.WordFeatureExtractor != null; | ||
s.UseCharExtractor = args.CharFeatureExtractor != null; | ||
TextLanguage = args.Language, | ||
TextCase = args.TextCase, | ||
KeepDiacritics = args.KeepDiacritics, | ||
KeepPunctuations = args.KeepPunctuations, | ||
KeepNumbers = args.KeepNumbers, | ||
OutputTokens = args.OutputTokens, | ||
VectorNormalizer = args.VectorNormalizer, | ||
UseStopRemover = args.UsePredefinedStopWordRemover, | ||
UseWordExtractor = args.WordFeatureExtractor != null, | ||
UseCharExtractor = args.CharFeatureExtractor != null, | ||
}; | ||
|
||
var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, settings); | ||
|
@@ -530,7 +569,7 @@ private sealed class Transformer : ITransformer, ICanSaveModel | |
private readonly IHost _host; | ||
private readonly IDataView _xf; | ||
|
||
public Transformer(IHostEnvironment env, IDataView input, IDataView view) | ||
internal Transformer(IHostEnvironment env, IDataView input, IDataView view) | ||
{ | ||
_host = env.Register(nameof(Transformer)); | ||
_xf = ApplyTransformUtils.ApplyAllTransformsToData(_host, view, new EmptyDataView(_host, input.Schema), input); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
umm :) #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue #2460 to discuss this
In reply to: 254357706 [](ancestors = 254357706)