diff --git a/google_symptoms/delphi_google_symptoms/geo.py b/google_symptoms/delphi_google_symptoms/geo.py index 913fd4b80..85ea24674 100644 --- a/google_symptoms/delphi_google_symptoms/geo.py +++ b/google_symptoms/delphi_google_symptoms/geo.py @@ -37,7 +37,7 @@ def generate_transition_matrix(geo_res): if geo_res == "hrr": map_df["population"] = map_df["population"] * map_df["weight"] - aggregated_pop = map_df.groupby(geo_res).sum().reset_index() + aggregated_pop = map_df.groupby(geo_res).sum(numeric_only=True).reset_index() map_df = map_df.merge( aggregated_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"] ) @@ -79,8 +79,11 @@ def geo_map(df, geo_res, namescols = None): return df map_df = generate_transition_matrix(geo_res) - converted_df = pd.DataFrame(columns = df.columns) - for _date in df["timestamp"].unique(): + + dates_list = df["timestamp"].unique() + dfs_list = [pd.DataFrame()] * len(dates_list) + + for i, _date in enumerate(dates_list): val_lists = df[df["timestamp"] == _date].merge( map_df["geo_id"], how="right" )[namescols].fillna(0) @@ -92,5 +95,8 @@ def geo_map(df, geo_res, namescols = None): newdf["geo_id"] = list(map_df.keys())[1:] mask = (newdf == 0) newdf[mask] = np.nan - converted_df = converted_df.append(newdf) - return converted_df + dfs_list[i] = newdf + + # Reindex to make sure output has same columns as input df. Filled with + # NaN values if column doesn't already exist. + return pd.concat(dfs_list).reindex(df.columns, axis=1) diff --git a/google_symptoms/delphi_google_symptoms/pull.py b/google_symptoms/delphi_google_symptoms/pull.py index 82247ee53..b339c962c 100644 --- a/google_symptoms/delphi_google_symptoms/pull.py +++ b/google_symptoms/delphi_google_symptoms/pull.py @@ -82,6 +82,7 @@ def preprocess(df, level): index_df = pd.MultiIndex.from_product( [geo_list, date_list], names=['geo_id', 'date'] ) + df.date = pd.to_datetime(df.date) df = df.set_index( ["geo_id", "date"] ).reindex( @@ -296,7 +297,7 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop( "geo_id", axis=1) df_dc_county["geo_id"] = DC_FIPS - dfs["county"] = dfs["county"].append(df_dc_county) + dfs["county"] = pd.concat([dfs["county"], df_dc_county]) except KeyError: pass diff --git a/google_symptoms/setup.py b/google_symptoms/setup.py index 2285cf1cb..91af03e64 100644 --- a/google_symptoms/setup.py +++ b/google_symptoms/setup.py @@ -4,14 +4,15 @@ required = [ "mock", "numpy", - "pandas==1.3.5", + "pandas", "pydocstyle", "pytest", "pytest-cov", "pylint==2.8.3", "delphi-utils", "freezegun", - "pandas-gbq" + "pandas-gbq", + "db-dtypes" ] setup( diff --git a/google_symptoms/tests/test_geo.py b/google_symptoms/tests/test_geo.py index 9ed6fef3d..a2bfd88ad 100644 --- a/google_symptoms/tests/test_geo.py +++ b/google_symptoms/tests/test_geo.py @@ -47,6 +47,7 @@ def test_hrr(self): ).drop("weight", axis="columns") hrr_pop = fips2hrr.groupby("hrr" ).sum( + numeric_only=True ).reset_index( ).rename(columns={"population": "hrr_pop"}) df_plus = df.merge(fips2hrr, left_on="geo_id", right_on="fips", how="left" @@ -59,6 +60,7 @@ def test_hrr(self): combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3 ).groupby("hrr" ).sum( + numeric_only=True ).drop( labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]], axis="columns" @@ -91,6 +93,7 @@ def test_msa(self): fips2msa = gmpr.add_population_column(gmpr.get_crosswalk("fips", "msa"), "fips") msa_pop = fips2msa.groupby("msa" ).sum( + numeric_only=True ).reset_index( ).rename(columns={"population": "msa_pop"}) df_plus = df.merge(fips2msa, left_on="geo_id", right_on="fips", how="left" @@ -103,6 +106,7 @@ def test_msa(self): combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3 ).groupby("msa" ).sum( + numeric_only=True ).drop( labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]], axis="columns" @@ -136,6 +140,7 @@ def test_hhs(self): state2hhs = gmpr.add_geocode(state2hhs, "state_code", "hhs") hhs_pop = state2hhs.groupby("hhs" ).sum( + numeric_only=True ).reset_index( ).rename(columns={"population": "hhs_pop"}) df_plus = df.merge(state2hhs, left_on="geo_id", right_on="state_id", how="left" @@ -148,6 +153,7 @@ def test_hhs(self): combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3 ).groupby("hhs" ).sum( + numeric_only=True ).drop( labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]], axis="columns" @@ -181,6 +187,7 @@ def test_nation(self): state2nation = gmpr.add_geocode(state2nation, "state_code", "nation") nation_pop = state2nation.groupby("nation" ).sum( + numeric_only=True ).reset_index( ).rename(columns={"population": "nation_pop"}) df_plus = df.merge(state2nation, left_on="geo_id", right_on="state_id", how="left" @@ -193,6 +200,7 @@ def test_nation(self): combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3 ).groupby("nation" ).sum( + numeric_only=True ).drop( labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]], axis="columns" diff --git a/google_symptoms/tests/test_pull.py b/google_symptoms/tests/test_pull.py index 3a47f4ef1..eef454884 100644 --- a/google_symptoms/tests/test_pull.py +++ b/google_symptoms/tests/test_pull.py @@ -1,5 +1,6 @@ import pytest import mock +import db_dtypes from freezegun import freeze_time from datetime import date, datetime import pandas as pd @@ -90,6 +91,16 @@ def test_invalid_fips(self): with pytest.raises(AssertionError): preprocess(df, "county") + def test_no_rows_nulled(self): + """ + Check that rows are not mysteriously nulled out. See + https://github.com/cmu-delphi/covidcast-indicators/pull/1496 for motivating issue. + """ + # Cast date field to `dbdate` to match dataframe dtypes as provided by the BigQuery fetch. + df = pd.read_csv(good_input["state"]).astype({"date": "dbdate"}) + out = preprocess(df, "state") + assert df.shape[0] == out[~out.Cough.isna()].shape[0] + class TestPullHelperFuncs: @freeze_time("2021-01-05")