SLEP006 on Sample Properties (#16)

jnothman · web-flow · commit 677293140f8b · 2020-06-29T08:20:04.000+02:00
* Starting to draft SLEP006 on Sample Properties

* iter

* WIP

* WIP

* a fourth solution and a little more fleshing... still no code examples.

* Code examples using Solution 4

* A couple of cross-references

* WIP

* Filling out example code

* Note handling of misspelled keys

* Note the status quo hacks

* new code

* Small additions including section on nomenclature

* Some more thoughts on backwards compatibility

* Note on potential for mixed keys
diff --git a/conf.py b/conf.py
@@ -42,6 +42,7 @@
     'sphinx.ext.intersphinx',
     'sphinx.ext.mathjax',
     'sphinx.ext.viewcode',
+    'sphinx_issues',
 ]
 
 # Add any paths that contain templates here, relative to this directory.
@@ -165,3 +166,7 @@
 # -- Options for intersphinx extension ---------------------------------------
 
 intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)}
+
+# -- Sphinx-Issues configuration --
+
+issues_github_path = "scikit-learn/scikit-learn"
diff --git a/index.rst b/index.rst
@@ -29,6 +29,7 @@
     slep002/proposal
     slep003/proposal
     slep004/proposal
+    slep006/proposal
 
 .. toctree::
     :maxdepth: 1
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
 sphinx
 sphinx-rtd-theme
+sphinx-issues
diff --git a/slep006/cases_opt0a.py b/slep006/cases_opt0a.py
@@ -0,0 +1,6 @@
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# TODO
diff --git a/slep006/cases_opt0b.py b/slep006/cases_opt0b.py
@@ -0,0 +1,7 @@
+import pandas as pd
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# TODO
diff --git a/slep006/cases_opt1.py b/slep006/cases_opt1.py
@@ -0,0 +1,68 @@
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate, make_pipeline, X, y,
+                  my_groups, my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed
+# instead, the estimator would fit and score without weight, silently failing.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+
+class MyLogisticRegressionCV(LogisticRegressionCV):
+    """LogisticRegressionCV that never uses ``sample_weight`` when fitting."""
+
+    def fit(self, X, y, props=None):
+        """Fit with 'sample_weight' removed from ``props``.
+
+        ``props`` may be None (the default); the caller's dict is not mutated.
+        Returns whatever the parent ``fit`` returns.
+        """
+        if props is not None:
+            # Work on a copy so the caller's dict keeps its 'sample_weight'.
+            props = props.copy()
+            props.pop('sample_weight', None)
+        return super().fit(X, y, props=props)
+
+
+# %%
+# Case C: unweighted feature selection
+
+# Currently feature selection does not handle sample_weight, and as long as
+# that remains the case, it will simply ignore the prop passed to it. Hence:
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy')
+
+# %%
+# Case D: different scoring and fitting weights
+
+weighted_acc = make_scorer(accuracy)
+
+
+def specially_weighted_acc(est, X, y, props):
+    """Score via weighted_acc with props['scoring_weight'] as the weights."""
+    # Copy so that the caller's props (whose 'sample_weight' is meant for
+    # fitting) are left untouched.
+    props = props.copy()
+    # Substitute the weight values themselves, not the name of the key.
+    props['sample_weight'] = props['scoring_weight']
+    return weighted_acc(est, X, y, props)
+
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=specially_weighted_acc,
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={
+                    'scoring_weight': my_weights,
+                    'sample_weight': my_other_weights,
+                    'groups': my_groups,
+               },
+               scoring=specially_weighted_acc)
diff --git a/slep006/cases_opt2.py b/slep006/cases_opt2.py
@@ -0,0 +1,70 @@
+from defs import (group_cv, SelectKBest, LogisticRegressionCV,
+                  cross_validate, make_pipeline, X, y, my_groups,
+                  my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'estimator__sample_weight': my_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
+
+# Error handling: if props={'estimator__sample_eight': my_weights, ...} was
+# passed instead, the estimator would raise an error.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
+
+# %%
+# Case C: unweighted feature selection
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+pipe = make_pipeline(SelectKBest(), lr)
+props = {'cv__groups': my_groups,
+         'estimator__logisticregressioncv__cv__groups': my_groups,
+         'estimator__logisticregressioncv__sample_weight': my_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(pipe, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
+
+# %%
+# Case D: different scoring and fitting weights
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+)
+props = {'cv__groups': my_groups,
+         'estimator__cv__groups': my_groups,
+         'estimator__sample_weight': my_other_weights,
+         'scoring__sample_weight': my_weights,
+         'estimator__scoring__sample_weight': my_weights}
+cross_validate(lr, X, y, cv=group_cv,
+               props=props,
+               scoring='accuracy')
diff --git a/slep006/cases_opt3.py b/slep006/cases_opt3.py
@@ -0,0 +1,99 @@
+from defs import (accuracy, make_scorer, SelectKBest, LogisticRegressionCV,
+                  group_cv, cross_validate, make_pipeline, X, y, my_groups,
+                  my_weights, my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  'scoring': ['sample_weight'],
+                  }
+    # one question here is whether we need to explicitly route sample_weight
+    # to LogisticRegressionCV's fitting...
+)
+
+# Alternative syntax, which assumes cv receives 'groups' by default, and that a
+# method-based API is provided on meta-estimators:
+#   lr = LogisticRegressionCV(
+#       cv=group_cv,
+#       scoring='accuracy',
+#   ).add_prop_route(scoring='sample_weight')
+
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',  # pass all props
+                             'cv': ['groups'],
+                             'scoring': ['sample_weight'],
+                             })
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed
+# instead, LogisticRegressionCV would have to identify that a key was passed
+# that could not be routed nor used, in order to raise an error.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+# Here we rename the sample_weight prop so that we can specify that it only
+# applies to scoring.
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  # read the following as "scoring should consume
+                  # 'scoring_weight' as if it were 'sample_weight'."
+                  'scoring': {'sample_weight': 'scoring_weight'},
+                  },
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'scoring_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',
+                             'cv': ['groups'],
+                             'scoring': {'sample_weight': 'scoring_weight'},
+                             })
+
+# %%
+# Case C: unweighted feature selection
+
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  'scoring': ['sample_weight'],
+                  })
+# Only the 'logisticregressioncv' step is listed in the pipeline's routing, so
+# (presumably) SelectKBest receives no props and selects features unweighted.
+pipe = make_pipeline(SelectKBest(), lr,
+                     prop_routing={'logisticregressioncv': ['sample_weight',
+                                                            'groups']})
+# Cross-validate the pipeline (not the bare lr) so that feature selection is
+# part of what is being evaluated.
+cross_validate(pipe, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring='accuracy',
+               prop_routing={'estimator': '*',
+                             'cv': ['groups'],
+                             'scoring': ['sample_weight'],
+                             })
+
+# %%
+# Case D: different scoring and fitting weights
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring='accuracy',
+    prop_routing={'cv': ['groups'],
+                  # read the following as "scoring should consume
+                  # 'scoring_weight' as if it were 'sample_weight'."
+                  'scoring': {'sample_weight': 'scoring_weight'},
+                  },
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'scoring_weight': my_weights, 'groups': my_groups,
+                      'fitting_weight': my_other_weights},
+               scoring='accuracy',
+               prop_routing={'estimator': {'sample_weight': 'fitting_weight',
+                                           'scoring_weight': 'scoring_weight',
+                                           'groups': 'groups'},
+                             'cv': ['groups'],
+                             'scoring': {'sample_weight': 'scoring_weight'},
+                             })
diff --git a/slep006/cases_opt4.py b/slep006/cases_opt4.py
@@ -0,0 +1,78 @@
+from defs import (accuracy, group_cv, make_scorer, SelectKBest,
+                  LogisticRegressionCV, cross_validate,
+                  make_pipeline, X, y, my_groups, my_weights,
+                  my_other_weights)
+
+# %%
+# Case A: weighted scoring and fitting
+
+# Here we presume that GroupKFold requests `groups` by default.
+# We need to explicitly request weights in make_scorer and for
+# LogisticRegressionCV. Both of these consumers understand the meaning
+# of the key "sample_weight".
+
+weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).set_props_request(['sample_weight'])
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# Error handling: if props={'sample_eight': my_weights, ...} was passed,
+# cross_validate would raise an error, since 'sample_eight' was not requested
+# by any of its children.
+
+# %%
+# Case B: weighted scoring and unweighted fitting
+
+# Since LogisticRegressionCV requires that weights explicitly be requested,
+# removing that request means the fitting is unweighted.
+
+weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+)
+cross_validate(lr, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# %%
+# Case C: unweighted feature selection
+
+# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly.
+# Here it does not request them.
+
+weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).set_props_request(['sample_weight'])
+sel = SelectKBest()
+pipe = make_pipeline(sel, lr)
+cross_validate(pipe, X, y, cv=group_cv,
+               props={'sample_weight': my_weights, 'groups': my_groups},
+               scoring=weighted_acc)
+
+# %%
+# Case D: different scoring and fitting weights
+
+# Despite make_scorer and LogisticRegressionCV both expecting a key
+# sample_weight, we can use aliases to pass different weights to different
+# consumers.
+
+weighted_acc = make_scorer(accuracy,
+                           request_props={'scoring_weight': 'sample_weight'})
+lr = LogisticRegressionCV(
+    cv=group_cv,
+    scoring=weighted_acc,
+).set_props_request({'fitting_weight': "sample_weight"})
+cross_validate(lr, X, y, cv=group_cv,
+               props={
+                    'scoring_weight': my_weights,
+                    'fitting_weight': my_other_weights,
+                    'groups': my_groups,
+               },
+               scoring=weighted_acc)
diff --git a/slep006/defs.py b/slep006/defs.py
@@ -0,0 +1,19 @@
+"""Shared imports and toy data for the SLEP006 case-study scripts."""
+import numpy as np
+from sklearn.feature_selection import SelectKBest
+from sklearn.linear_model import LogisticRegressionCV
+# sklearn.metrics has no ``accuracy``; alias accuracy_score to keep the name.
+from sklearn.metrics import accuracy_score as accuracy
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import GroupKFold, cross_validate
+from sklearn.pipeline import make_pipeline
+
+N, M = 100, 4
+X = np.random.rand(N, M)
+# randint's upper bound is exclusive: use 2 so that y holds both classes.
+y = np.random.randint(0, 2, size=N)
+my_groups = np.random.randint(0, 10, size=N)
+my_weights = np.random.rand(N)
+my_other_weights = np.random.rand(N)
+# Group-aware splitter imported as ``group_cv`` by the cases_opt*.py scripts.
+group_cv = GroupKFold()
diff --git a/slep006/proposal.rst b/slep006/proposal.rst

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`sphinx`
`2`	`2`	`sphinx-rtd-theme`
	`3`	`+sphinx-issues`