Commit 419c7de

Merge pull request #1589 from cmu-delphi/revert-1586-release/indicators_v0.3.11_utils_v0.3.3
Revert "Release covidcast-indicators 0.3.11"
2 parents 3cb7411 + 18c23e8 · commit 419c7de

6 files changed: +12 −204 lines

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.11
+current_version = 0.3.10
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False

ansible/templates/dsew_community_profile-params-prod.json.j2

Lines changed: 0 additions & 9 deletions

@@ -39,14 +39,5 @@
         "booster_doses_admin_7dav"
       ]
     }
-  },
-  "archive": {
-    "aws_credentials": {
-      "aws_access_key_id": "{{ delphi_aws_access_key_id }}",
-      "aws_secret_access_key": "{{ delphi_aws_secret_access_key }}"
-    },
-    "bucket_name": "delphi-covidcast-indicator-output",
-    "cache_dir": "./cache",
-    "indicator_prefix": "delphi_dsew_community_profile"
   }
 }

dsew_community_profile/.gitignore

Lines changed: 0 additions & 1 deletion
This file was deleted.

dsew_community_profile/delphi_dsew_community_profile/constants.py

Lines changed: 0 additions & 2 deletions

@@ -6,8 +6,6 @@
 DOWNLOAD_ATTACHMENT = URL_PREFIX + "/files/{assetId}?download=true&filename={filename}"
 DOWNLOAD_LISTING = URL_PREFIX + ".json"
 
-INTERP_LENGTH = 5
-
 @dataclass
 class Transform:
     """Transformation filters for interpreting a particular sheet in the workbook."""

dsew_community_profile/delphi_dsew_community_profile/pull.py

Lines changed: 7 additions & 89 deletions

@@ -4,7 +4,6 @@
 import datetime
 import os
 import re
-from typing import Dict, Tuple
 from urllib.parse import quote_plus as quote_as_url
 
 import pandas as pd
@@ -13,14 +12,9 @@
 
 from delphi_utils.geomap import GeoMapper
 
-from .constants import (
-    TRANSFORMS, SIGNALS, COUNTS_7D_SIGNALS, NEWLINE,
+from .constants import (TRANSFORMS, SIGNALS, COUNTS_7D_SIGNALS, NEWLINE,
     IS_PROP, NOT_PROP,
-    DOWNLOAD_ATTACHMENT, DOWNLOAD_LISTING,
-    INTERP_LENGTH
-)
-
-DataDict = Dict[Tuple[str, str, bool], pd.DataFrame]
+    DOWNLOAD_ATTACHMENT, DOWNLOAD_LISTING)
 
 # YYYYMMDD
 # example: "Community Profile Report 20211104.xlsx"
@@ -418,54 +412,27 @@ def fetch_listing(params):
         )
         for el in listing if el['filename'].endswith("xlsx")
     ]
-    keep = []
     if params['indicator']['reports'] == 'new':
         # drop files we already have in the input cache
-        keep = [el for el in listing if not os.path.exists(el['cached_filename'])]
+        listing = [el for el in listing if not os.path.exists(el['cached_filename'])]
     elif params['indicator']['reports'].find("--") > 0:
         # drop files outside the specified publish-date range
         start_str, _, end_str = params['indicator']['reports'].partition("--")
         start_date = datetime.datetime.strptime(start_str, "%Y-%m-%d").date()
         end_date = datetime.datetime.strptime(end_str, "%Y-%m-%d").date()
-        keep = [
+        listing = [
            el for el in listing
            if start_date <= el['publish_date'] <= end_date
         ]
-
     # reference date is guaranteed to be on or before publish date, so we can trim
     # reports that are too early
     if 'export_start_date' in params['indicator']:
-        keep = [
-            el for el in keep
+        listing = [
+            el for el in listing
             if params['indicator']['export_start_date'] <= el['publish_date']
         ]
     # can't do the same for export_end_date
-
-    # if we're only running on a subset, make sure we have enough data for interp
-    if keep:
-        keep = extend_listing_for_interp(keep, listing)
-    return keep if keep else listing
-
-def extend_listing_for_interp(keep, listing):
-    """Grab additional files from the full listing for interpolation if needed.
-
-    Selects files based purely on publish_date, so may include duplicates where
-    multiple reports for a single publish_date are available.
-
-    Parameters:
-    - keep: list of reports desired in the final output
-    - listing: complete list of reports available from healthdata.gov
-
-    Returns: list of reports including keep and additional files needed for
-    interpolation.
-    """
-    publish_date_keeplist = set()
-    for el in keep:
-        # starts at 0 so includes keep publish_dates
-        for i in range(INTERP_LENGTH):
-            publish_date_keeplist.add(el['publish_date'] - datetime.timedelta(days=i))
-    keep = [el for el in listing if el['publish_date'] in publish_date_keeplist]
-    return keep
+    return listing
 
 def download_and_parse(listing, logger):
     """Convert a list of report files into Dataset instances."""
@@ -605,57 +572,8 @@ def fetch_new_reports(params, logger=None):
         if SIGNALS[sig]["make_prop"]:
             ret[(geo, sig, IS_PROP)] = generate_prop_signal(df, geo, geomapper)
 
-    ret = interpolate_missing_values(ret)
-
     return ret
 
-def interpolate_missing_values(dfs: DataDict) -> DataDict:
-    """Interpolates each signal in the dictionary of dfs."""
-    interpolate_df = dict()
-    for key, df in dfs.items():
-        # Here we exclude the 'positivity' signal from interpolation. This is a temporary fix.
-        # https://github.com/cmu-delphi/covidcast-indicators/issues/1576
-        _, sig, _ = key
-        if sig == "positivity":
-            continue
-
-        geo_dfs = []
-        for geo, group_df in df.groupby("geo_id"):
-            reindexed_group_df = group_df.set_index("timestamp").reindex(
-                pd.date_range(group_df.timestamp.min(), group_df.timestamp.max())
-            )
-            reindexed_group_df["geo_id"] = geo
-            if "val" in reindexed_group_df.columns and not reindexed_group_df["val"].isna().all():
-                reindexed_group_df["val"] = (
-                    reindexed_group_df["val"]
-                    .interpolate(method="linear", limit_area="inside")
-                    .astype(float)
-                )
-            if "se" in reindexed_group_df.columns:
-                reindexed_group_df["se"] = (
-                    reindexed_group_df["se"]
-                    .interpolate(method="linear", limit_area="inside")
-                    .astype(float)
-                )
-            if (
-                "sample_size" in reindexed_group_df.columns
-                and not reindexed_group_df["sample_size"].isna().all()
-            ):
-                reindexed_group_df["sample_size"] = (
-                    reindexed_group_df["sample_size"]
-                    .interpolate(method="linear", limit_area="inside")
-                    .astype(float)
-                )
-            if "publish_date" in reindexed_group_df.columns:
-                reindexed_group_df["publish_date"] = reindexed_group_df["publish_date"].fillna(
-                    method="bfill"
-                )
-            geo_dfs.append(reindexed_group_df)
-        interpolate_df[key] = (
-            pd.concat(geo_dfs).reset_index().rename(columns={"index": "timestamp"})
-        )
-    return interpolate_df
-
 def generate_prop_signal(df, geo, geo_mapper):
     """Transform base df into a proportion (per 100k population)."""
     if geo == "state":

dsew_community_profile/tests/test_pull.py

Lines changed: 4 additions & 102 deletions

@@ -1,44 +1,20 @@
 from collections import namedtuple
-from dataclasses import dataclass
-from datetime import date, datetime, timedelta
+from datetime import date, datetime
 from itertools import chain
-from typing import Any, Dict, List, Union
 import pandas as pd
-from pandas.util.testing import assert_frame_equal
 import numpy as np
 import pytest
 from unittest.mock import patch, Mock
 
 from delphi_utils.geomap import GeoMapper
 
-from delphi_dsew_community_profile.pull import (
-    DatasetTimes, Dataset,
+from delphi_dsew_community_profile.pull import (DatasetTimes, Dataset,
     fetch_listing, nation_from_state, generate_prop_signal,
-    std_err, add_max_ts_col, unify_testing_sigs, interpolate_missing_values,
-    extend_listing_for_interp
-)
+    std_err, add_max_ts_col, unify_testing_sigs)
 
 
 example = namedtuple("example", "given expected")
-
-def _assert_frame_equal(df1, df2, index_cols: List[str] = None):
-    # Ensure same columns present.
-    assert set(df1.columns) == set(df2.columns)
-    # Ensure same column order.
-    df1 = df1[df1.columns]
-    df2 = df2[df1.columns]
-    # Ensure same row order by using a common index and sorting.
-    df1 = df1.set_index(index_cols).sort_index()
-    df2 = df2.set_index(index_cols).sort_index()
-    return assert_frame_equal(df1, df2)
-
-def _set_df_dtypes(df: pd.DataFrame, dtypes: Dict[str, Any]) -> pd.DataFrame:
-    df = df.copy()
-    for k, v in dtypes.items():
-        if k in df.columns:
-            df[k] = df[k].astype(v)
-    return df
-
+
 class TestPull:
     def test_DatasetTimes(self):
         examples = [
@@ -477,77 +453,3 @@ def test_std_err(self):
                 "sample_size": [2, 2, 5, 10, 20, 0]
             })
         )
-
-    def test_interpolation(self):
-        DTYPES = {"geo_id": str, "timestamp": "datetime64[ns]", "val": float, "se": float, "sample_size": float, "publish_date": "datetime64[ns]"}
-        line = lambda x: 3 * x + 5
-
-        sig1 = _set_df_dtypes(pd.DataFrame({
-            "geo_id": "1",
-            "timestamp": pd.date_range("2022-01-01", "2022-01-10"),
-            "val": [line(i) for i in range(2, 12)],
-            "se": [line(i) for i in range(1, 11)],
-            "sample_size": [line(i) for i in range(0, 10)],
-            "publish_date": pd.to_datetime("2022-01-10")
-        }), dtypes=DTYPES)
-        # A linear signal missing two days which should be filled exactly by the linear interpolation.
-        missing_sig1 = sig1[(sig1.timestamp <= "2022-01-05") | (sig1.timestamp >= "2022-01-08")]
-
-        sig2 = sig1.copy()
-        sig2["geo_id"] = "2"
-        # A linear signal missing everything but the end points, should be filled exactly by linear interpolation.
-        missing_sig2 = sig2[(sig2.timestamp == "2022-01-01") | (sig2.timestamp == "2022-01-10")]
-
-        sig3 = _set_df_dtypes(pd.DataFrame({
-            "geo_id": "3",
-            "timestamp": pd.date_range("2022-01-01", "2022-01-10"),
-            "val": None,
-            "se": [line(i) for i in range(1, 11)],
-            "sample_size": [line(i) for i in range(0, 10)],
-            "publish_date": pd.to_datetime("2022-01-10")
-        }), dtypes=DTYPES)
-        # A signal missing everything, should be left alone.
-        missing_sig3 = sig3[(sig3.timestamp <= "2022-01-05") | (sig3.timestamp >= "2022-01-08")]
-
-        sig4 = _set_df_dtypes(pd.DataFrame({
-            "geo_id": "4",
-            "timestamp": pd.date_range("2022-01-01", "2022-01-10"),
-            "val": [None] * 9 + [10.0],
-            "se": [line(i) for i in range(1, 11)],
-            "sample_size": [line(i) for i in range(0, 10)],
-            "publish_date": pd.to_datetime("2022-01-10")
-        }), dtypes=DTYPES)
-        # A signal missing everything except for one point, should be left alone.
-        missing_sig4 = sig4[(sig4.timestamp <= "2022-01-05") | (sig4.timestamp >= "2022-01-08")]
-
-        missing_dfs = [missing_sig1, missing_sig2, missing_sig3, missing_sig4]
-        interpolated_dfs1 = interpolate_missing_values({("src", "sig", False): pd.concat(missing_dfs)})
-        expected_dfs = pd.concat([sig1, sig2, sig3, sig4])
-        _assert_frame_equal(interpolated_dfs1[("src", "sig", False)], expected_dfs, index_cols=["geo_id", "timestamp"])
-
-    @patch("delphi_dsew_community_profile.pull.INTERP_LENGTH", 2)
-    def test_extend_listing(self):
-        listing = [
-            {"publish_date": date(2020, 1, 20) - timedelta(days=i)}
-            for i in range(20)
-        ]
-        examples = [
-            # single range
-            example(
-                [{"publish_date": date(2020, 1, 20)}],
-                [{"publish_date": date(2020, 1, 20)}, {"publish_date": date(2020, 1, 19)}]
-            ),
-            # disjoint ranges
-            example(
-                [{"publish_date": date(2020, 1, 20)}, {"publish_date": date(2020, 1, 10)}],
-                [{"publish_date": date(2020, 1, 20)}, {"publish_date": date(2020, 1, 19)},
-                 {"publish_date": date(2020, 1, 10)}, {"publish_date": date(2020, 1, 9)}]
-            ),
-            # conjoined ranges
-            example(
-                [{"publish_date": date(2020, 1, 20)}, {"publish_date": date(2020, 1, 19)}],
-                [{"publish_date": date(2020, 1, 20)}, {"publish_date": date(2020, 1, 19)}, {"publish_date": date(2020, 1, 18)}]
-            ),
-        ]
-        for ex in examples:
-            assert extend_listing_for_interp(ex.given, listing) == ex.expected, ex.given
