cmu-delphi · krivard · Oct 17, 2022 · Jan 26, 2022 · Jan 27, 2022 · Jan 27, 2022
diff --git a/google_symptoms/delphi_google_symptoms/geo.py b/google_symptoms/delphi_google_symptoms/geo.py
@@ -37,7 +37,7 @@ def generate_transition_matrix(geo_res):
     if geo_res == "hrr":
         map_df["population"] = map_df["population"] *  map_df["weight"]
 
-    aggregated_pop = map_df.groupby(geo_res).sum().reset_index()
+    aggregated_pop = map_df.groupby(geo_res).sum(numeric_only=True).reset_index()
     map_df = map_df.merge(
             aggregated_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
             )
@@ -79,8 +79,11 @@ def geo_map(df, geo_res, namescols =  None):
         return df
 
     map_df = generate_transition_matrix(geo_res)
-    converted_df = pd.DataFrame(columns = df.columns)
-    for _date in df["timestamp"].unique():
+
+    dates_list = df["timestamp"].unique()
+    dfs_list = [pd.DataFrame()] * len(dates_list)
+
+    for i, _date in enumerate(dates_list):
         val_lists = df[df["timestamp"] == _date].merge(
                 map_df["geo_id"], how="right"
                 )[namescols].fillna(0)
@@ -92,5 +95,10 @@ def geo_map(df, geo_res, namescols =  None):
         newdf["geo_id"] = list(map_df.keys())[1:]
         mask = (newdf == 0)
         newdf[mask] = np.nan
-        converted_df = converted_df.append(newdf)
-    return converted_df
+        dfs_list[i] = newdf
+
+    # pd.concat raises ValueError on an empty list, which happens when df has
+    # no timestamps; fall back to one empty frame so the result is still a
+    # DataFrame. Reindex to make sure output has same columns as input df.
+    # Filled with NaN values if column doesn't already exist.
+    return pd.concat(dfs_list or [pd.DataFrame()]).reindex(df.columns, axis=1)
diff --git a/google_symptoms/delphi_google_symptoms/pull.py b/google_symptoms/delphi_google_symptoms/pull.py
@@ -82,6 +82,7 @@ def preprocess(df, level):
         index_df = pd.MultiIndex.from_product(
             [geo_list, date_list], names=['geo_id', 'date']
         )
+        df.date = pd.to_datetime(df.date)
         df = df.set_index(
             ["geo_id", "date"]
         ).reindex(
@@ -296,7 +297,7 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day
         df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
             "geo_id", axis=1)
         df_dc_county["geo_id"] = DC_FIPS
-        dfs["county"] = dfs["county"].append(df_dc_county)
+        dfs["county"] = pd.concat([dfs["county"], df_dc_county])
     except KeyError:
         pass
 

diff --git a/google_symptoms/setup.py b/google_symptoms/setup.py
@@ -4,14 +4,15 @@
 required = [
     "mock",
     "numpy",
-    "pandas==1.3.5",
+    "pandas",
     "pydocstyle",
     "pytest",
     "pytest-cov",
     "pylint==2.8.3",
     "delphi-utils",
     "freezegun",
-    "pandas-gbq"
+    "pandas-gbq",
+    "db-dtypes"
 ]
 
 setup(

diff --git a/google_symptoms/tests/test_geo.py b/google_symptoms/tests/test_geo.py
@@ -47,6 +47,7 @@ def test_hrr(self):
             ).drop("weight", axis="columns")
         hrr_pop = fips2hrr.groupby("hrr"
             ).sum(
+                numeric_only=True
             ).reset_index(
             ).rename(columns={"population": "hrr_pop"})
         df_plus = df.merge(fips2hrr, left_on="geo_id", right_on="fips", how="left"
@@ -59,6 +60,7 @@ def test_hrr(self):
                 combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
             ).groupby("hrr"
             ).sum(
+                numeric_only=True
             ).drop(
                 labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
                 axis="columns"
@@ -91,6 +93,7 @@ def test_msa(self):
         fips2msa = gmpr.add_population_column(gmpr.get_crosswalk("fips", "msa"), "fips")
         msa_pop = fips2msa.groupby("msa"
             ).sum(
+                numeric_only=True
             ).reset_index(
             ).rename(columns={"population": "msa_pop"})
         df_plus = df.merge(fips2msa, left_on="geo_id", right_on="fips", how="left"
@@ -103,6 +106,7 @@ def test_msa(self):
                 combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
             ).groupby("msa"
             ).sum(
+                numeric_only=True
             ).drop(
                 labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
                 axis="columns"
@@ -136,6 +140,7 @@ def test_hhs(self):
         state2hhs = gmpr.add_geocode(state2hhs, "state_code", "hhs")
         hhs_pop = state2hhs.groupby("hhs"
             ).sum(
+                numeric_only=True
             ).reset_index(
             ).rename(columns={"population": "hhs_pop"})
         df_plus = df.merge(state2hhs, left_on="geo_id", right_on="state_id", how="left"
@@ -148,6 +153,7 @@ def test_hhs(self):
                 combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
             ).groupby("hhs"
             ).sum(
+                numeric_only=True
             ).drop(
                 labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
                 axis="columns"
@@ -181,6 +187,7 @@ def test_nation(self):
         state2nation = gmpr.add_geocode(state2nation, "state_code", "nation")
         nation_pop = state2nation.groupby("nation"
             ).sum(
+                numeric_only=True
             ).reset_index(
             ).rename(columns={"population": "nation_pop"})
         df_plus = df.merge(state2nation, left_on="geo_id", right_on="state_id", how="left"
@@ -193,6 +200,7 @@ def test_nation(self):
                 combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
             ).groupby("nation"
             ).sum(
+                numeric_only=True
             ).drop(
                 labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
                 axis="columns"

diff --git a/google_symptoms/tests/test_pull.py b/google_symptoms/tests/test_pull.py
@@ -1,5 +1,6 @@
 import pytest
 import mock
+import db_dtypes
 from freezegun import freeze_time
 from datetime import date, datetime
 import pandas as pd
@@ -90,6 +91,16 @@ def test_invalid_fips(self):
         with pytest.raises(AssertionError):
             preprocess(df, "county")
 
+    def test_no_rows_nulled(self):
+        """
+        Check that preprocess() leaves as many non-null Cough rows as there are input rows. See
+        https://github.com/cmu-delphi/covidcast-indicators/pull/1496 for motivating issue.
+        """
+        # The BigQuery fetch provides `date` with the `dbdate` dtype, so cast it to reproduce that here.
+        raw = pd.read_csv(good_input["state"]).astype({"date": "dbdate"})
+        processed = preprocess(raw, "state")
+        assert raw.shape[0] == processed[processed.Cough.notna()].shape[0]
+
 
 class TestPullHelperFuncs:
     @freeze_time("2021-01-05")