From bf5137a41f646e1f2ec54d07686965b9fb5dc243 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 10:44:46 -0800 Subject: [PATCH 1/5] ensuring boolean columns get cast to float for statsmodels --- ISLP/models/model_spec.py | 12 ++++++++---- tests/models/test_boolean_columns.py | 23 +++++++++++++++++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 tests/models/test_boolean_columns.py diff --git a/ISLP/models/model_spec.py b/ISLP/models/model_spec.py index 07a8b88..983a85d 100644 --- a/ISLP/models/model_spec.py +++ b/ISLP/models/model_spec.py @@ -107,7 +107,6 @@ def fit(self, X, y=None): cats = self.encoder_.categories_[0] column_names = [str(n) for n in cats] - if isinstance(X, pd.DataFrame): # expecting a column, we take .iloc[:,0] X = X.iloc[:,0] @@ -635,18 +634,23 @@ def build_model(column_info, if isinstance(X, (pd.Series, pd.DataFrame)): df = pd.concat(dfs, axis=1) df.index = X.index - return df else: - return np.column_stack(dfs) + return np.column_stack(dfs).astype(float) else: # return a 0 design zero = np.zeros(X.shape[0]) if isinstance(X, (pd.Series, pd.DataFrame)): df = pd.DataFrame({'zero': zero}) df.index = X.index - return df else: return zero + # if we reach here, we will be returning a DataFrame + + for col in df.columns: + if df[col].dtype == bool: + df[col] = df[col].astype(float) + return df + def derived_feature(variables, encoder=None, name=None, use_transform=True): """ Create a Feature, optionally diff --git a/tests/models/test_boolean_columns.py b/tests/models/test_boolean_columns.py new file mode 100644 index 0000000..7b5a429 --- /dev/null +++ b/tests/models/test_boolean_columns.py @@ -0,0 +1,23 @@ +import pandas as pd +import statsmodels.api as sm +import numpy as np +from itertools import combinations + +from ISLP.models import ModelSpec as MS + +rng = np.random.default_rng(0) + +df = pd.DataFrame({'A':rng.standard_normal(10), + 'B':np.array([1,2,3,2,1,1,1,3,2,1], int), + 
'C':np.array([True,False,False,True,True]*2, bool), + 'D':rng.standard_normal(10)}) +Y = rng.standard_normal(10) + +def test_all(): + + for i in range(1, 5): + for comb in combinations(['A','B','C','D'], i): + + X = MS(comb).fit_transform(df) + sm.OLS(Y, X).fit() + From 88316a6a1a0a3660ff0be308ca46d643e5e8da9f Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 10:46:02 -0800 Subject: [PATCH 2/5] editing setup files, cleanup --- ISLP/info.py | 132 ++++++++++++++++++++++--------------------------- pyproject.toml | 6 ++- setup.cfg | 3 ++ setup.py | 36 ++++++-------- 4 files changed, 81 insertions(+), 96 deletions(-) diff --git a/ISLP/info.py b/ISLP/info.py index 870c1e9..df1505c 100644 --- a/ISLP/info.py +++ b/ISLP/info.py @@ -3,81 +3,67 @@ In setup.py in particular, we exec this file, so it cannot import regreg """ -# regreg version information. An empty _version_extra corresponds to a -# full release. '.dev' as a _version_extra string means this is a development -# version -_version_major = 0 -_version_minor = 2 -_version_micro = 0 -_version_extra = '' +# CLASSIFIERS = ["Development Status :: 3 - Alpha", +# "Environment :: Console", +# "Intended Audience :: Science/Research", +# "License :: OSI Approved :: BSD License", +# "Operating System :: OS Independent", +# "Programming Language :: Python", +# "Topic :: Scientific/Engineering"] -# Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z" -__version__ = "%s.%s.%s%s" % (_version_major, - _version_minor, - _version_micro, - _version_extra) +# description = 'Library for ISLP labs' -CLASSIFIERS = ["Development Status :: 3 - Alpha", - "Environment :: Console", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: BSD License", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Topic :: Scientific/Engineering"] +# # # Note: this long_description is actually a copy/paste from the top-level +# # # README.txt, so that it shows up nicely 
on PyPI. So please remember to edit +# # # it only in one place and sync it correctly. +# # long_description = \ +# # """ +# # ============ +# # Fixed lambda +# # ============ -description = 'Library for ISLP labs' +# # This mini-package contains a module to perform +# # a fixed lambda test for the LASSO. +# # """ -# Note: this long_description is actually a copy/paste from the top-level -# README.txt, so that it shows up nicely on PyPI. So please remember to edit -# it only in one place and sync it correctly. -long_description = \ -""" -============ -Fixed lambda -============ - -This mini-package contains a module to perform -a fixed lambda test for the LASSO. -""" - -# versions -NUMPY_MIN_VERSION='1.7.1' -SCIPY_MIN_VERSION = '0.9' -PANDAS_MIN_VERSION = "0.20" -PANDAS_MAX_VERSION = "1.9" -SKLEARN_MIN_VERSION = '1.2' -STATSMODELS_MIN_VERSION = '0.13' -MATPLOTLIB_MIN_VERSION = '3.3.3' +# # # versions +# # NUMPY_MIN_VERSION='1.7.1' +# # SCIPY_MIN_VERSION = '0.9' +# # PANDAS_MIN_VERSION = "0.20" +# # PANDAS_MAX_VERSION = "1.9" +# # SKLEARN_MIN_VERSION = '1.2' +# # STATSMODELS_MIN_VERSION = '0.13' +# # MATPLOTLIB_MIN_VERSION = '3.3.3' -NAME = 'ISLP' -MAINTAINER = "Jonathan Taylor" -MAINTAINER_EMAIL = "" -DESCRIPTION = description -LONG_DESCRIPTION = long_description -URL = "http://github.org/intro-stat-learning/ISLP" -DOWNLOAD_URL = "" -LICENSE = "BSD license" -CLASSIFIERS = CLASSIFIERS -AUTHOR = "ISLP authors" -AUTHOR_EMAIL = "" -PLATFORMS = "OS Independent" -MAJOR = _version_major -MINOR = _version_minor -MICRO = _version_micro -ISRELEASE = _version_extra == '' -VERSION = __version__ -STATUS = 'alpha' -PROVIDES = [] -REQUIRES = ["numpy (>=%s)" % NUMPY_MIN_VERSION, - "scipy (>=%s)" % SCIPY_MIN_VERSION, - "statsmodels (>=%s)" % STATSMODELS_MIN_VERSION, - "pandas (>=%s)" % PANDAS_MIN_VERSION, - "pandas (<=%s)" % PANDAS_MAX_VERSION, - "sklearn (>=%s)" % SKLEARN_MIN_VERSION, - "lifelines", - "joblib", - "pygam", - "torch", - "torchmetrics", - "pytorch_lightning" - ] +# 
NAME = 'ISLP' +# MAINTAINER = "Jonathan Taylor" +# MAINTAINER_EMAIL = "jonathan.taylor@stanford.edu" +# DESCRIPTION = description +# LONG_DESCRIPTION = long_description +# URL = "http://github.org/intro-stat-learning/ISLP" +# DOWNLOAD_URL = "https://pypi.org/project/ISLP/" +# LICENSE = "BSD license" +# CLASSIFIERS = CLASSIFIERS +# AUTHOR = "ISLP authors" +# AUTHOR_EMAIL = "" +# PLATFORMS = "OS Independent" +# MAJOR = _version_major +# MINOR = _version_minor +# MICRO = _version_micro +# ISRELEASE = _version_extra == '' +# VERSION = __version__ +# STATUS = 'alpha' +# PROVIDES = [] +# REQUIRES = ["numpy (>=%s)" % NUMPY_MIN_VERSION, +# "scipy (>=%s)" % SCIPY_MIN_VERSION, +# "statsmodels (>=%s)" % STATSMODELS_MIN_VERSION, +# "pandas (>=%s)" % PANDAS_MIN_VERSION, +# "pandas (<=%s)" % PANDAS_MAX_VERSION, +# "sklearn (>=%s)" % SKLEARN_MIN_VERSION, +# "lifelines", +# "joblib", +# "pygam", +# "torch", +# "torchmetrics", +# "pytorch_lightning" +# ] diff --git a/pyproject.toml b/pyproject.toml index b94fdf7..e0359ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ISLP" -version = "0.3.21" +version = "0.3.22" dependencies = ["numpy>=1.7.1,<1.25", # max version for numba "scipy>=0.9", "pandas>=0.20,<=1.9", @@ -37,6 +37,7 @@ classifiers = ["Development Status :: 3 - Alpha", "Programming Language :: Python", "Topic :: Scientific/Engineering" ] + [project.urls] # Optional "Homepage" = "https://github.com/intro-stat-learning/ISLP" "Bug Reports" = "https://github.com/intro-stat-learning/ISLP/issues" @@ -44,6 +45,9 @@ classifiers = ["Development Status :: 3 - Alpha", "Say Thanks!" 
= "http://saythanks.io/to/example" "Source" = "https://github.com/pypa/sampleproject/" +[project.optional-dependencies] +doc = ['Sphinx>=3.0'] + [build-system] requires = ["setuptools>=42", "wheel", diff --git a/setup.cfg b/setup.cfg index 14d7ccd..4a6f58e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,6 @@ style = pep440 versionfile_source = ISLP/_version.py tag_prefix = parentdir_prefix = ISLP- + +[metadata] +license_files = LICENSE.txt \ No newline at end of file diff --git a/setup.py b/setup.py index c30a6d7..19a91a3 100755 --- a/setup.py +++ b/setup.py @@ -55,14 +55,6 @@ def read_vars_from(ver_file): # is missing. Otherwise the monkeypatched Extension will change .pyx # filenames to .c filenames, and we probably don't have the .c files. sys.path.insert(0, pjoin(dirname(__file__), 'fake_pyrex')) -# Set setuptools extra arguments -extra_setuptools_args = dict( - tests_require=['nose'], - test_suite='nose.collector', - zip_safe=False, - extras_require = dict( - doc=['Sphinx>=1.0'], - test=['nose>=0.10.1'])) # Define extensions EXTS = [] @@ -74,20 +66,20 @@ def read_vars_from(ver_file): long_description = open('README.md', 'rt', encoding='utf-8').read() def main(**extra_args): - setup(name=info.NAME, - maintainer=info.MAINTAINER, - maintainer_email=info.MAINTAINER_EMAIL, - description=info.DESCRIPTION, - url=info.URL, - download_url=info.DOWNLOAD_URL, - license=info.LICENSE, - classifiers=info.CLASSIFIERS, - author=info.AUTHOR, - author_email=info.AUTHOR_EMAIL, - platforms=info.PLATFORMS, + setup(#name=info.NAME, + #maintainer=info.MAINTAINER, + #maintainer_email=info.MAINTAINER_EMAIL, + #description=info.DESCRIPTION, + #url=info.URL, + #download_url=info.DOWNLOAD_URL, + #license=info.LICENSE, + #classifiers=info.CLASSIFIERS, + #author=info.AUTHOR, + #author_email=info.AUTHOR_EMAIL, + #platforms=info.PLATFORMS, version=versioneer.get_version(), - requires=info.REQUIRES, - provides=info.PROVIDES, + #requires=info.REQUIRES, + #provides=info.PROVIDES, packages = 
['ISLP', 'ISLP.models', 'ISLP.models', @@ -108,4 +100,4 @@ def main(**extra_args): #simple way to test what setup will do #python setup.py install --prefix=/tmp if __name__ == "__main__": - main(**extra_setuptools_args) + main() From 4366af94681a84db7e26d1670462ed202d8ddd3a Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 11:06:20 -0800 Subject: [PATCH 3/5] commenting out info --- setup.py | 85 ++++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 49 deletions(-) diff --git a/setup.py b/setup.py index 19a91a3..8fba84c 100755 --- a/setup.py +++ b/setup.py @@ -20,41 +20,41 @@ from distutils.core import setup from distutils.extension import Extension -# Get various parameters for this version, stored in ISLP/info.py - -class Bunch(object): - def __init__(self, vars): - for key, name in vars.items(): - if key.startswith('__'): - continue - self.__dict__[key] = name - -def read_vars_from(ver_file): - """ Read variables from Python text file - - Parameters - ---------- - ver_file : str - Filename of file to read - - Returns - ------- - info_vars : Bunch instance - Bunch object where variables read from `ver_file` appear as - attributes - """ - # Use exec for compabibility with Python 3 - ns = {} - with open(ver_file, 'rt') as fobj: - exec(fobj.read(), ns) - return Bunch(ns) - -info = read_vars_from(pjoin('ISLP', 'info.py')) - -# Try to preempt setuptools monkeypatching of Extension handling when Pyrex -# is missing. Otherwise the monkeypatched Extension will change .pyx -# filenames to .c filenames, and we probably don't have the .c files. 
-sys.path.insert(0, pjoin(dirname(__file__), 'fake_pyrex')) +# # Get various parameters for this version, stored in ISLP/info.py + +# class Bunch(object): +# def __init__(self, vars): +# for key, name in vars.items(): +# if key.startswith('__'): +# continue +# self.__dict__[key] = name + +# def read_vars_from(ver_file): +# """ Read variables from Python text file + +# Parameters +# ---------- +# ver_file : str +# Filename of file to read + +# Returns +# ------- +# info_vars : Bunch instance +# Bunch object where variables read from `ver_file` appear as +# attributes +# """ +# # Use exec for compabibility with Python 3 +# ns = {} +# with open(ver_file, 'rt') as fobj: +# exec(fobj.read(), ns) +# return Bunch(ns) + +# info = read_vars_from(pjoin('ISLP', 'info.py')) + +# # Try to preempt setuptools monkeypatching of Extension handling when Pyrex +# # is missing. Otherwise the monkeypatched Extension will change .pyx +# # filenames to .c filenames, and we probably don't have the .c files. +# sys.path.insert(0, pjoin(dirname(__file__), 'fake_pyrex')) # Define extensions EXTS = [] @@ -66,20 +66,7 @@ def read_vars_from(ver_file): long_description = open('README.md', 'rt', encoding='utf-8').read() def main(**extra_args): - setup(#name=info.NAME, - #maintainer=info.MAINTAINER, - #maintainer_email=info.MAINTAINER_EMAIL, - #description=info.DESCRIPTION, - #url=info.URL, - #download_url=info.DOWNLOAD_URL, - #license=info.LICENSE, - #classifiers=info.CLASSIFIERS, - #author=info.AUTHOR, - #author_email=info.AUTHOR_EMAIL, - #platforms=info.PLATFORMS, - version=versioneer.get_version(), - #requires=info.REQUIRES, - #provides=info.PROVIDES, + setup(version=versioneer.get_version(), packages = ['ISLP', 'ISLP.models', 'ISLP.models', From 4d5eb34ce18f7b811a2e092d13ce51af2a399164 Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 11:07:49 -0800 Subject: [PATCH 4/5] cleanup of setup.py --- ISLP/info.py | 69 ---------------------------------------------------- setup.py 
| 37 +--------------------------- 2 files changed, 1 insertion(+), 105 deletions(-) delete mode 100644 ISLP/info.py diff --git a/ISLP/info.py b/ISLP/info.py deleted file mode 100644 index df1505c..0000000 --- a/ISLP/info.py +++ /dev/null @@ -1,69 +0,0 @@ -""" This file contains defines parameters for regreg that we use to fill -settings in setup.py, the regreg top-level docstring, and for building the docs. -In setup.py in particular, we exec this file, so it cannot import regreg -""" - -# CLASSIFIERS = ["Development Status :: 3 - Alpha", -# "Environment :: Console", -# "Intended Audience :: Science/Research", -# "License :: OSI Approved :: BSD License", -# "Operating System :: OS Independent", -# "Programming Language :: Python", -# "Topic :: Scientific/Engineering"] - -# description = 'Library for ISLP labs' - -# # # Note: this long_description is actually a copy/paste from the top-level -# # # README.txt, so that it shows up nicely on PyPI. So please remember to edit -# # # it only in one place and sync it correctly. -# # long_description = \ -# # """ -# # ============ -# # Fixed lambda -# # ============ - -# # This mini-package contains a module to perform -# # a fixed lambda test for the LASSO. 
-# # """ - -# # # versions -# # NUMPY_MIN_VERSION='1.7.1' -# # SCIPY_MIN_VERSION = '0.9' -# # PANDAS_MIN_VERSION = "0.20" -# # PANDAS_MAX_VERSION = "1.9" -# # SKLEARN_MIN_VERSION = '1.2' -# # STATSMODELS_MIN_VERSION = '0.13' -# # MATPLOTLIB_MIN_VERSION = '3.3.3' - -# NAME = 'ISLP' -# MAINTAINER = "Jonathan Taylor" -# MAINTAINER_EMAIL = "jonathan.taylor@stanford.edu" -# DESCRIPTION = description -# LONG_DESCRIPTION = long_description -# URL = "http://github.org/intro-stat-learning/ISLP" -# DOWNLOAD_URL = "https://pypi.org/project/ISLP/" -# LICENSE = "BSD license" -# CLASSIFIERS = CLASSIFIERS -# AUTHOR = "ISLP authors" -# AUTHOR_EMAIL = "" -# PLATFORMS = "OS Independent" -# MAJOR = _version_major -# MINOR = _version_minor -# MICRO = _version_micro -# ISRELEASE = _version_extra == '' -# VERSION = __version__ -# STATUS = 'alpha' -# PROVIDES = [] -# REQUIRES = ["numpy (>=%s)" % NUMPY_MIN_VERSION, -# "scipy (>=%s)" % SCIPY_MIN_VERSION, -# "statsmodels (>=%s)" % STATSMODELS_MIN_VERSION, -# "pandas (>=%s)" % PANDAS_MIN_VERSION, -# "pandas (<=%s)" % PANDAS_MAX_VERSION, -# "sklearn (>=%s)" % SKLEARN_MIN_VERSION, -# "lifelines", -# "joblib", -# "pygam", -# "torch", -# "torchmetrics", -# "pytorch_lightning" -# ] diff --git a/setup.py b/setup.py index 8fba84c..4834240 100755 --- a/setup.py +++ b/setup.py @@ -20,46 +20,11 @@ from distutils.core import setup from distutils.extension import Extension -# # Get various parameters for this version, stored in ISLP/info.py - -# class Bunch(object): -# def __init__(self, vars): -# for key, name in vars.items(): -# if key.startswith('__'): -# continue -# self.__dict__[key] = name - -# def read_vars_from(ver_file): -# """ Read variables from Python text file - -# Parameters -# ---------- -# ver_file : str -# Filename of file to read - -# Returns -# ------- -# info_vars : Bunch instance -# Bunch object where variables read from `ver_file` appear as -# attributes -# """ -# # Use exec for compabibility with Python 3 -# ns = {} -# with 
open(ver_file, 'rt') as fobj: -# exec(fobj.read(), ns) -# return Bunch(ns) - -# info = read_vars_from(pjoin('ISLP', 'info.py')) - -# # Try to preempt setuptools monkeypatching of Extension handling when Pyrex -# # is missing. Otherwise the monkeypatched Extension will change .pyx -# # filenames to .c filenames, and we probably don't have the .c files. -# sys.path.insert(0, pjoin(dirname(__file__), 'fake_pyrex')) # Define extensions EXTS = [] -cmdclass=versioneer.get_cmdclass() +cmdclass = versioneer.get_cmdclass() # get long_description From 6f718c89883c89357b60e1032fa8cc00817fafbe Mon Sep 17 00:00:00 2001 From: Jonathan Taylor Date: Thu, 18 Jan 2024 11:42:06 -0800 Subject: [PATCH 5/5] use iloc for float conversion in case there are duplicate columns --- ISLP/models/model_spec.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ISLP/models/model_spec.py b/ISLP/models/model_spec.py index 983a85d..ce09e01 100644 --- a/ISLP/models/model_spec.py +++ b/ISLP/models/model_spec.py @@ -632,7 +632,7 @@ def build_model(column_info, if len(dfs): if isinstance(X, (pd.Series, pd.DataFrame)): - df = pd.concat(dfs, axis=1) + df = pd.concat(dfs, axis='columns') df.index = X.index else: return np.column_stack(dfs).astype(float) @@ -645,10 +645,11 @@ def build_model(column_info, return zero # if we reach here, we will be returning a DataFrame - - for col in df.columns: - if df[col].dtype == bool: - df[col] = df[col].astype(float) + # make sure all columns are floats + + for i, _ in enumerate(df.columns): + if df.iloc[:,i].dtype == bool: + df.iloc[:,i] = df.iloc[:,i].astype(float) return df def derived_feature(variables, encoder=None, name=None, use_transform=True):