diff --git a/.gitignore b/.gitignore
index 8c6107d1..b1ea75c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,5 @@ ENV/
 
 # OS Files
 .DS_Store
+# vcode stuff
+.vcode/
diff --git a/sdmetrics/single_table/__init__.py b/sdmetrics/single_table/__init__.py
index 35704626..ab6c76c0 100644
--- a/sdmetrics/single_table/__init__.py
+++ b/sdmetrics/single_table/__init__.py
@@ -6,7 +6,8 @@
 from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood
 from sdmetrics.single_table.detection.base import DetectionMetric
 from sdmetrics.single_table.detection.sklearn import (
-    LogisticDetection, ScikitLearnClassifierDetectionMetric, SVCDetection)
+    GradientBoostingDetection, LogisticDetection, ScikitLearnClassifierDetectionMetric,
+    SVCDetection)
 from sdmetrics.single_table.efficacy.base import MLEfficacyMetric
 from sdmetrics.single_table.efficacy.binary import (
     BinaryAdaBoostClassifier, BinaryDecisionTreeClassifier, BinaryEfficacyMetric,
@@ -47,6 +48,7 @@
     'DetectionMetric',
     'LogisticDetection',
     'SVCDetection',
+    'GradientBoostingDetection',
     'ScikitLearnClassifierDetectionMetric',
     'MLEfficacyMetric',
     'BinaryEfficacyMetric',
diff --git a/sdmetrics/single_table/detection/__init__.py b/sdmetrics/single_table/detection/__init__.py
index b987a119..8450948b 100644
--- a/sdmetrics/single_table/detection/__init__.py
+++ b/sdmetrics/single_table/detection/__init__.py
@@ -1,8 +1,10 @@
 """Machine Learning Detection metrics for single table datasets."""
 
-from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection
+from sdmetrics.single_table.detection.sklearn import (
+    GradientBoostingDetection, LogisticDetection, SVCDetection)
 
 __all__ = [
+    'GradientBoostingDetection',
     'LogisticDetection',
     'SVCDetection'
 ]
diff --git a/sdmetrics/single_table/detection/sklearn.py b/sdmetrics/single_table/detection/sklearn.py
index a33a33d9..38f1aa7e 100644
--- a/sdmetrics/single_table/detection/sklearn.py
+++ b/sdmetrics/single_table/detection/sklearn.py
@@ -1,5 +1,6 @@
 """scikit-learn based DetectionMetrics for single table datasets."""
 
+from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.impute import SimpleImputer
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import Pipeline
@@ -67,3 +68,19 @@ class SVCDetection(ScikitLearnClassifierDetectionMetric):
     @staticmethod
     def _get_classifier():
         return SVC(probability=True, gamma='scale')
+
+
+class GradientBoostingDetection(ScikitLearnClassifierDetectionMetric):
+    """ScikitLearnClassifierDetectionMetric based on a GradientBoostingClassifier.
+
+    This metric builds a GradientBoostingClassifier Classifier that learns to tell the synthetic
+    data apart from the real data, which later on is evaluated using Cross Validation.
+
+    The output of the metric is one minus the average ROC AUC score obtained.
+    """
+
+    name = 'GradientBoosting Detection'
+
+    @staticmethod
+    def _get_classifier():
+        return GradientBoostingClassifier()
diff --git a/tests/integration/single_table/test_single_table.py b/tests/integration/single_table/test_single_table.py
index 7ecd45b3..2f880887 100644
--- a/tests/integration/single_table/test_single_table.py
+++ b/tests/integration/single_table/test_single_table.py
@@ -7,7 +7,8 @@
 from sdmetrics.goal import Goal
 from sdmetrics.single_table.base import SingleTableMetric
 from sdmetrics.single_table.bayesian_network import BNLikelihood, BNLogLikelihood
-from sdmetrics.single_table.detection import LogisticDetection, SVCDetection
+from sdmetrics.single_table.detection import (
+    GradientBoostingDetection, LogisticDetection, SVCDetection)
 from sdmetrics.single_table.multi_column_pairs import (
     ContingencySimilarity, ContinuousKLDivergence, DiscreteKLDivergence)
 from sdmetrics.single_table.multi_single_column import (
@@ -17,6 +18,7 @@
 METRICS = [
     CSTest,
     KSComplement,
+    GradientBoostingDetection,
     LogisticDetection,
     SVCDetection,
     ContinuousKLDivergence,