Skip to content

Commit 6772931

Browse files
authored
SLEP006 on Sample Properties (#16)
* Starting to draft SLEP006 on Sample Properties * iter * WIP * WIP * a fourth solution and a little more fleshing... still no code examples. * Code examples using Solution 4 * A couple of cross-references * WIP * Filling out example code * Note handling of misspelled keys * Note the status quo hacks * new code * Small additions including section on nomenclature * Some more thoughts on backwards compatibility * Note on potential for mixed keys
1 parent 1d57fe0 commit 6772931

File tree

11 files changed

+794
-0
lines changed

11 files changed

+794
-0
lines changed

conf.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
'sphinx.ext.intersphinx',
4343
'sphinx.ext.mathjax',
4444
'sphinx.ext.viewcode',
45+
'sphinx_issues',
4546
]
4647

4748
# Add any paths that contain templates here, relative to this directory.
@@ -165,3 +166,7 @@
165166
# -- Options for intersphinx extension ---------------------------------------
166167

167168
intersphinx_mapping = {'sklearn': ('http://scikit-learn.org/stable', None)}
169+
170+
# -- Sphinx-Issues configuration --
171+
172+
issues_github_path = "scikit-learn/scikit-learn"

index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
slep002/proposal
3030
slep003/proposal
3131
slep004/proposal
32+
slep006/proposal
3233

3334
.. toctree::
3435
:maxdepth: 1

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
sphinx
22
sphinx-rtd-theme
3+
sphinx-issues

slep006/cases_opt0a.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from defs import (accuracy, group_cv, make_scorer, SelectKBest,
2+
LogisticRegressionCV, cross_validate,
3+
make_pipeline, X, y, my_groups, my_weights,
4+
my_other_weights)
5+
6+
# TODO

slep006/cases_opt0b.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import pandas as pd
2+
from defs import (accuracy, group_cv, make_scorer, SelectKBest,
3+
LogisticRegressionCV, cross_validate,
4+
make_pipeline, X, y, my_groups, my_weights,
5+
my_other_weights)
6+
7+
# TODO

slep006/cases_opt1.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
from defs import (accuracy, group_cv, make_scorer, SelectKBest,
2+
LogisticRegressionCV, cross_validate, make_pipeline, X, y,
3+
my_groups, my_weights, my_other_weights)
4+
5+
# %%
6+
# Case A: weighted scoring and fitting
7+
8+
lr = LogisticRegressionCV(
9+
cv=group_cv,
10+
scoring='accuracy',
11+
)
12+
cross_validate(lr, X, y, cv=group_cv,
13+
props={'sample_weight': my_weights, 'groups': my_groups},
14+
scoring='accuracy')
15+
16+
# Error handling: if props={'sample_eight': my_weights, ...} was passed
17+
# instead, the estimator would fit and score without weight, silently failing.
18+
19+
# %%
20+
# Case B: weighted scoring and unweighted fitting
21+
22+
23+
class MyLogisticRegressionCV(LogisticRegressionCV):
    """LogisticRegressionCV variant whose ``fit`` discards ``sample_weight``.

    Used for Case B (weighted scoring, unweighted fitting): the caller keeps
    ``sample_weight`` in ``props`` for scoring, while this subclass removes it
    before delegating to the parent ``fit``.
    """

    def fit(self, X, y, props=None):
        """Fit via the parent class with ``sample_weight`` removed from props.

        Parameters
        ----------
        X, y
            Forwarded unchanged to the parent ``fit``.
        props : dict or None, default=None
            Sample properties. ``None`` is forwarded as-is; a dict is copied
            before ``'sample_weight'`` is dropped.

        Returns
        -------
        Whatever the parent ``fit`` returns.
        """
        if props is not None:
            # Copy first so the caller's dict, which is still needed for
            # weighted scoring, is not mutated.
            props = props.copy()
            props.pop('sample_weight', None)
        return super().fit(X, y, props=props)
28+
29+
30+
# %%
31+
# Case C: unweighted feature selection
32+
33+
# Currently feature selection does not handle sample_weight, and as long as
34+
# that remains the case, it will simply ignore the prop passed to it. Hence:
35+
36+
lr = LogisticRegressionCV(
37+
cv=group_cv,
38+
scoring='accuracy',
39+
)
40+
sel = SelectKBest()
41+
pipe = make_pipeline(sel, lr)
42+
cross_validate(pipe, X, y, cv=group_cv,
43+
props={'sample_weight': my_weights, 'groups': my_groups},
44+
scoring='accuracy')
45+
46+
# %%
47+
# Case D: different scoring and fitting weights
48+
49+
weighted_acc = make_scorer(accuracy)
50+
51+
52+
def specially_weighted_acc(est, X, y, props):
    """Score with ``weighted_acc``, weighting by ``props['scoring_weight']``.

    The incoming ``props`` is copied, and the values stored under
    ``'scoring_weight'`` are placed under ``'sample_weight'`` in the copy
    before it is handed to ``weighted_acc``.

    Raises
    ------
    KeyError
        If ``props`` has no ``'scoring_weight'`` entry.
    """
    props = props.copy()
    # Use the weights stored under 'scoring_weight', not the key name itself.
    props['sample_weight'] = props['scoring_weight']
    return weighted_acc(est, X, y, props)
56+
57+
58+
lr = LogisticRegressionCV(
59+
cv=group_cv,
60+
scoring=specially_weighted_acc,
61+
)
62+
cross_validate(lr, X, y, cv=group_cv,
63+
props={
64+
'scoring_weight': my_weights,
65+
'sample_weight': my_other_weights,
66+
'groups': my_groups,
67+
},
68+
scoring=specially_weighted_acc)

slep006/cases_opt2.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
from defs import (group_cv, SelectKBest, LogisticRegressionCV,
2+
cross_validate, make_pipeline, X, y, my_groups,
3+
my_weights, my_other_weights)
4+
5+
# %%
6+
# Case A: weighted scoring and fitting
7+
8+
lr = LogisticRegressionCV(
9+
cv=group_cv,
10+
scoring='accuracy',
11+
)
12+
props = {'cv__groups': my_groups,
13+
'estimator__cv__groups': my_groups,
14+
'estimator__sample_weight': my_weights,
15+
'scoring__sample_weight': my_weights,
16+
'estimator__scoring__sample_weight': my_weights}
17+
cross_validate(lr, X, y, cv=group_cv,
18+
props=props,
19+
scoring='accuracy')
20+
21+
# Error handling: if props={'estimator__sample_eight': my_weights, ...} was
22+
# passed instead, the estimator would raise an error.
23+
24+
# %%
25+
# Case B: weighted scoring and unweighted fitting
26+
27+
lr = LogisticRegressionCV(
28+
cv=group_cv,
29+
scoring='accuracy',
30+
)
31+
props = {'cv__groups': my_groups,
32+
'estimator__cv__groups': my_groups,
33+
'scoring__sample_weight': my_weights,
34+
'estimator__scoring__sample_weight': my_weights}
35+
cross_validate(lr, X, y, cv=group_cv,
36+
props=props,
37+
scoring='accuracy')
38+
39+
# %%
40+
# Case C: unweighted feature selection
41+
42+
lr = LogisticRegressionCV(
43+
cv=group_cv,
44+
scoring='accuracy',
45+
)
46+
pipe = make_pipeline(SelectKBest(), lr)
47+
props = {'cv__groups': my_groups,
48+
'estimator__logisticregressioncv__cv__groups': my_groups,
49+
'estimator__logisticregressioncv__sample_weight': my_weights,
50+
'scoring__sample_weight': my_weights,
51+
'estimator__scoring__sample_weight': my_weights}
52+
cross_validate(pipe, X, y, cv=group_cv,
53+
props=props,
54+
scoring='accuracy')
55+
56+
# %%
57+
# Case D: different scoring and fitting weights
58+
59+
lr = LogisticRegressionCV(
60+
cv=group_cv,
61+
scoring='accuracy',
62+
)
63+
props = {'cv__groups': my_groups,
64+
'estimator__cv__groups': my_groups,
65+
'estimator__sample_weight': my_other_weights,
66+
'scoring__sample_weight': my_weights,
67+
'estimator__scoring__sample_weight': my_weights}
68+
cross_validate(lr, X, y, cv=group_cv,
69+
props=props,
70+
scoring='accuracy')

slep006/cases_opt3.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
from defs import (accuracy, make_scorer, SelectKBest, LogisticRegressionCV,
2+
group_cv, cross_validate, make_pipeline, X, y, my_groups,
3+
my_weights, my_other_weights)
4+
5+
# %%
6+
# Case A: weighted scoring and fitting
7+
8+
lr = LogisticRegressionCV(
9+
cv=group_cv,
10+
scoring='accuracy',
11+
prop_routing={'cv': ['groups'],
12+
'scoring': ['sample_weight'],
13+
}
14+
# one question here is whether we need to explicitly route sample_weight
15+
# to LogisticRegressionCV's fitting...
16+
)
17+
18+
# Alternative syntax, which assumes cv receives 'groups' by default, and that a
19+
# method-based API is provided on meta-estimators:
20+
# lr = LogisticRegressionCV(
21+
# cv=group_cv,
22+
# scoring='accuracy',
23+
# ).add_prop_route(scoring='sample_weight')
24+
25+
cross_validate(lr, X, y, cv=group_cv,
26+
props={'sample_weight': my_weights, 'groups': my_groups},
27+
scoring='accuracy',
28+
prop_routing={'estimator': '*', # pass all props
29+
'cv': ['groups'],
30+
'scoring': ['sample_weight'],
31+
})
32+
33+
# Error handling: if props={'sample_eight': my_weights, ...} was passed
34+
# instead, LogisticRegressionCV would have to identify that a key was passed
35+
# that could be neither routed nor used, in order to raise an error.
36+
37+
# %%
38+
# Case B: weighted scoring and unweighted fitting
39+
40+
# Here we rename the sample_weight prop so that we can specify that it only
41+
# applies to scoring.
42+
lr = LogisticRegressionCV(
43+
cv=group_cv,
44+
scoring='accuracy',
45+
prop_routing={'cv': ['groups'],
46+
# read the following as "scoring should consume
47+
# 'scoring_weight' as if it were 'sample_weight'."
48+
'scoring': {'sample_weight': 'scoring_weight'},
49+
},
50+
)
51+
cross_validate(lr, X, y, cv=group_cv,
52+
props={'scoring_weight': my_weights, 'groups': my_groups},
53+
scoring='accuracy',
54+
prop_routing={'estimator': '*',
55+
'cv': ['groups'],
56+
'scoring': {'sample_weight': 'scoring_weight'},
57+
})
58+
59+
# %%
60+
# Case C: unweighted feature selection
61+
62+
lr = LogisticRegressionCV(
63+
cv=group_cv,
64+
scoring='accuracy',
65+
prop_routing={'cv': ['groups'],
66+
'scoring': ['sample_weight'],
67+
})
68+
pipe = make_pipeline(SelectKBest(), lr,
69+
prop_routing={'logisticregressioncv': ['sample_weight',
70+
'groups']})
71+
# Evaluate the pipeline (SelectKBest + LogisticRegressionCV), not the bare
# ``lr``; otherwise the feature-selection step is never exercised.
cross_validate(pipe, X, y, cv=group_cv,
               props={'sample_weight': my_weights, 'groups': my_groups},
               scoring='accuracy',
               prop_routing={'estimator': '*',
                             'cv': ['groups'],
                             'scoring': ['sample_weight'],
                             })
78+
79+
# %%
80+
# Case D: different scoring and fitting weights
81+
lr = LogisticRegressionCV(
82+
cv=group_cv,
83+
scoring='accuracy',
84+
prop_routing={'cv': ['groups'],
85+
# read the following as "scoring should consume
86+
# 'scoring_weight' as if it were 'sample_weight'."
87+
'scoring': {'sample_weight': 'scoring_weight'},
88+
},
89+
)
90+
cross_validate(lr, X, y, cv=group_cv,
91+
props={'scoring_weight': my_weights, 'groups': my_groups,
92+
'fitting_weight': my_other_weights},
93+
scoring='accuracy',
94+
prop_routing={'estimator': {'sample_weight': 'fitting_weight',
95+
'scoring_weight': 'scoring_weight',
96+
'groups': 'groups'},
97+
'cv': ['groups'],
98+
'scoring': {'sample_weight': 'scoring_weight'},
99+
})

slep006/cases_opt4.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from defs import (accuracy, group_cv, make_scorer, SelectKBest,
2+
LogisticRegressionCV, cross_validate,
3+
make_pipeline, X, y, my_groups, my_weights,
4+
my_other_weights)
5+
6+
# %%
7+
# Case A: weighted scoring and fitting
8+
9+
# Here we presume that GroupKFold requests `groups` by default.
10+
# We need to explicitly request weights in make_scorer and for
11+
# LogisticRegressionCV. Both of these consumers understand the meaning
12+
# of the key "sample_weight".
13+
14+
weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
15+
lr = LogisticRegressionCV(
16+
cv=group_cv,
17+
scoring=weighted_acc,
18+
).set_props_request(['sample_weight'])
19+
cross_validate(lr, X, y, cv=group_cv,
20+
props={'sample_weight': my_weights, 'groups': my_groups},
21+
scoring=weighted_acc)
22+
23+
# Error handling: if props={'sample_eight': my_weights, ...} was passed,
24+
# cross_validate would raise an error, since 'sample_eight' was not requested
25+
# by any of its children.
26+
27+
# %%
28+
# Case B: weighted scoring and unweighted fitting
29+
30+
# Since LogisticRegressionCV requires that weights explicitly be requested,
31+
# removing that request means the fitting is unweighted.
32+
33+
weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
34+
lr = LogisticRegressionCV(
35+
cv=group_cv,
36+
scoring=weighted_acc,
37+
)
38+
cross_validate(lr, X, y, cv=group_cv,
39+
props={'sample_weight': my_weights, 'groups': my_groups},
40+
scoring=weighted_acc)
41+
42+
# %%
43+
# Case C: unweighted feature selection
44+
45+
# Like LogisticRegressionCV, SelectKBest needs to request weights explicitly.
46+
# Here it does not request them.
47+
48+
weighted_acc = make_scorer(accuracy, request_props=['sample_weight'])
49+
lr = LogisticRegressionCV(
50+
cv=group_cv,
51+
scoring=weighted_acc,
52+
).set_props_request(['sample_weight'])
53+
sel = SelectKBest()
54+
pipe = make_pipeline(sel, lr)
55+
cross_validate(pipe, X, y, cv=group_cv,
56+
props={'sample_weight': my_weights, 'groups': my_groups},
57+
scoring=weighted_acc)
58+
59+
# %%
60+
# Case D: different scoring and fitting weights
61+
62+
# Despite make_scorer and LogisticRegressionCV both expecting a key
63+
# sample_weight, we can use aliases to pass different weights to different
64+
# consumers.
65+
66+
weighted_acc = make_scorer(accuracy,
67+
request_props={'scoring_weight': 'sample_weight'})
68+
lr = LogisticRegressionCV(
69+
cv=group_cv,
70+
scoring=weighted_acc,
71+
).set_props_request({'fitting_weight': "sample_weight"})
72+
cross_validate(lr, X, y, cv=group_cv,
73+
props={
74+
'scoring_weight': my_weights,
75+
'fitting_weight': my_other_weights,
76+
'groups': my_groups,
77+
},
78+
scoring=weighted_acc)

slep006/defs.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegressionCV
# sklearn.metrics provides ``accuracy_score`` (there is no ``accuracy``); it is
# aliased so the case scripts can keep importing ``accuracy`` from this module.
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import make_scorer
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.pipeline import make_pipeline

# Group-aware splitter that the case scripts import from this module as
# ``group_cv``.
group_cv = GroupKFold()
8+
9+
# Toy data set used by the examples: N samples with M features each.
N, M = 100, 4
X = np.random.rand(N, M)
# The upper bound of ``randint`` is exclusive: ``randint(0, 1)`` would label
# every sample 0, so 2 is needed to draw binary labels from {0, 1}.
y = np.random.randint(0, 2, size=N)
# One group id in [0, 10) per sample.
my_groups = np.random.randint(0, 10, size=N)
# Two independent per-sample weight vectors in [0, 1).
my_weights = np.random.rand(N)
my_other_weights = np.random.rand(N)

0 commit comments

Comments
 (0)