Skip to content

Return google symptoms to using current pandas version, with mitigations #1497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 17, 2022
16 changes: 11 additions & 5 deletions google_symptoms/delphi_google_symptoms/geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def generate_transition_matrix(geo_res):
if geo_res == "hrr":
map_df["population"] = map_df["population"] * map_df["weight"]

aggregated_pop = map_df.groupby(geo_res).sum().reset_index()
aggregated_pop = map_df.groupby(geo_res).sum(numeric_only=True).reset_index()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering about the numeric_only=True. In the function arg type hint, it says there's no default, but in the docs below that it says it defaults to bool=True. Guessing that it actually tries to use NANs by default?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed this because I'm getting a warning about to-be-deprecated behavior. Thought I'd just throw it in here.

FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. 
In a future version, numeric_only will default to False. Either specify numeric_only or select 
only columns which should be valid for the function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current default does actually seem to be numeric_only=True

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks for that! Curious why they're moving away from numeric_only=True.

map_df = map_df.merge(
aggregated_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
)
Expand Down Expand Up @@ -79,8 +79,11 @@ def geo_map(df, geo_res, namescols = None):
return df

map_df = generate_transition_matrix(geo_res)
converted_df = pd.DataFrame(columns = df.columns)
for _date in df["timestamp"].unique():

dates_list = df["timestamp"].unique()
dfs_list = [pd.DataFrame()] * len(dates_list)

for i, _date in enumerate(dates_list):
val_lists = df[df["timestamp"] == _date].merge(
map_df["geo_id"], how="right"
)[namescols].fillna(0)
Expand All @@ -92,5 +95,8 @@ def geo_map(df, geo_res, namescols = None):
newdf["geo_id"] = list(map_df.keys())[1:]
mask = (newdf == 0)
newdf[mask] = np.nan
converted_df = converted_df.append(newdf)
return converted_df
dfs_list[i] = newdf

# Reindex to make sure output has same columns as input df. Filled with
# NaN values if column doesn't already exist.
return pd.concat(dfs_list).reindex(df.columns, axis=1)
3 changes: 2 additions & 1 deletion google_symptoms/delphi_google_symptoms/pull.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def preprocess(df, level):
index_df = pd.MultiIndex.from_product(
[geo_list, date_list], names=['geo_id', 'date']
)
df.date = pd.to_datetime(df.date)
df = df.set_index(
["geo_id", "date"]
).reindex(
Expand Down Expand Up @@ -296,7 +297,7 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day
df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
"geo_id", axis=1)
df_dc_county["geo_id"] = DC_FIPS
dfs["county"] = dfs["county"].append(df_dc_county)
dfs["county"] = pd.concat([dfs["county"], df_dc_county])
except KeyError:
pass

Expand Down
5 changes: 3 additions & 2 deletions google_symptoms/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
required = [
"mock",
"numpy",
"pandas==1.3.5",
"pandas",
"pydocstyle",
"pytest",
"pytest-cov",
"pylint==2.8.3",
"delphi-utils",
"freezegun",
"pandas-gbq"
"pandas-gbq",
"db-dtypes"
]

setup(
Expand Down
8 changes: 8 additions & 0 deletions google_symptoms/tests/test_geo.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_hrr(self):
).drop("weight", axis="columns")
hrr_pop = fips2hrr.groupby("hrr"
).sum(
numeric_only=True
).reset_index(
).rename(columns={"population": "hrr_pop"})
df_plus = df.merge(fips2hrr, left_on="geo_id", right_on="fips", how="left"
Expand All @@ -59,6 +60,7 @@ def test_hrr(self):
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
).groupby("hrr"
).sum(
numeric_only=True
).drop(
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
axis="columns"
Expand Down Expand Up @@ -91,6 +93,7 @@ def test_msa(self):
fips2msa = gmpr.add_population_column(gmpr.get_crosswalk("fips", "msa"), "fips")
msa_pop = fips2msa.groupby("msa"
).sum(
numeric_only=True
).reset_index(
).rename(columns={"population": "msa_pop"})
df_plus = df.merge(fips2msa, left_on="geo_id", right_on="fips", how="left"
Expand All @@ -103,6 +106,7 @@ def test_msa(self):
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
).groupby("msa"
).sum(
numeric_only=True
).drop(
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
axis="columns"
Expand Down Expand Up @@ -136,6 +140,7 @@ def test_hhs(self):
state2hhs = gmpr.add_geocode(state2hhs, "state_code", "hhs")
hhs_pop = state2hhs.groupby("hhs"
).sum(
numeric_only=True
).reset_index(
).rename(columns={"population": "hhs_pop"})
df_plus = df.merge(state2hhs, left_on="geo_id", right_on="state_id", how="left"
Expand All @@ -148,6 +153,7 @@ def test_hhs(self):
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
).groupby("hhs"
).sum(
numeric_only=True
).drop(
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
axis="columns"
Expand Down Expand Up @@ -181,6 +187,7 @@ def test_nation(self):
state2nation = gmpr.add_geocode(state2nation, "state_code", "nation")
nation_pop = state2nation.groupby("nation"
).sum(
numeric_only=True
).reset_index(
).rename(columns={"population": "nation_pop"})
df_plus = df.merge(state2nation, left_on="geo_id", right_on="state_id", how="left"
Expand All @@ -193,6 +200,7 @@ def test_nation(self):
combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
).groupby("nation"
).sum(
numeric_only=True
).drop(
labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
axis="columns"
Expand Down
11 changes: 11 additions & 0 deletions google_symptoms/tests/test_pull.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest
import mock
import db_dtypes
from freezegun import freeze_time
from datetime import date, datetime
import pandas as pd
Expand Down Expand Up @@ -90,6 +91,16 @@ def test_invalid_fips(self):
with pytest.raises(AssertionError):
preprocess(df, "county")

def test_no_rows_nulled(self):
"""
Check that rows are not mysteriously nulled out. See
https://github.com/cmu-delphi/covidcast-indicators/pull/1496 for motivating issue.
"""
# Cast date field to `dbdate` to match dataframe dtypes as provided by the BigQuery fetch.
df = pd.read_csv(good_input["state"]).astype({"date": "dbdate"})
out = preprocess(df, "state")
assert df.shape[0] == out[~out.Cough.isna()].shape[0]


class TestPullHelperFuncs:
@freeze_time("2021-01-05")
Expand Down