
[Refactor] Further cleans of csv_importer #1072

Merged
merged 5 commits into from Jan 27, 2023
75 changes: 58 additions & 17 deletions src/acquisition/covidcast/csv_importer.py
@@ -1,20 +1,26 @@
"""Collects and reads covidcast data from a set of local CSV files."""

# standard library
from dataclasses import dataclass
from datetime import date
import glob
import os
import re
from dataclasses import dataclass
from datetime import date
from glob import glob
from typing import Iterator, NamedTuple, Optional, Tuple
Comment on lines +6 to +9
Contributor

suggestion: keep these in alphabetical order

we use pylint to enforce alphabetical import order in covidcast-indicators, and it would be good to be consistent across repositories.

Contributor Author
@dshemetov Jan 25, 2023

explanation:

So I'm using a Python import sort extension in my IDE and it likes to:

  • have three sections separated by a blank line: Python standard library, third party, and first party
  • place bare import X statements above from X import Y statements
  • sort alphabetically within each section

So that's the convention I followed here (a minimal sketch follows below). I think we were sort of doing the first convention already, not really the second, and doing the third.

question: do you like the second requirement? should I go with just 1 and 3?
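
A minimal sketch of that three-section layout, using module names drawn from this file purely for illustration (it assumes the third- and first-party packages are installed):

# standard library
import glob
import os
import re
from dataclasses import dataclass
from datetime import date

# third party
import epiweeks as epi
import pandas as pd

# first party
from delphi.epidata.acquisition.covidcast.logger import get_structured_logger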

Contributor Author

I feel like this is an open-ended question that we can address in a more comprehensive push for broader linting/formatting goals. Gonna keep it as is for now.


# third party
import pandas as pd
import epiweeks as epi
import pandas as pd

# first party
from delphi_utils import Nans
from delphi.utils.epiweek import delta_epiweeks
from .logger import get_structured_logger
from delphi.epidata.acquisition.covidcast.database import CovidcastRow
from delphi.epidata.acquisition.covidcast.logger import get_structured_logger
Contributor

praise: fully-qualified name ftw!

this will make our lives easier if/when we eventually convert the delphi-epidata codebase to actual python modules, or if/when we switch to a single logger behavior definition everywhere


DFRow = NamedTuple('DFRow', [('geo_id', str), ('value', float), ('stderr', float), ('sample_size', float), ('missing_value', int), ('missing_stderr', int), ('missing_sample_size', int)])
PathDetails = NamedTuple('PathDetails', [('issue', int), ('lag', int), ('source', str), ('signal', str), ('time_type', str), ('time_value', int), ('geo_type', str)])


@dataclass
class CsvRowValue:
@@ -27,6 +33,7 @@ class CsvRowValue:
missing_stderr: int
missing_sample_size: int


class CsvImporter:
"""Finds and parses covidcast CSV files."""

@@ -60,6 +67,7 @@ class CsvImporter:
"missing_sample_size": "Int64"
}


@staticmethod
def is_sane_day(value):
"""Return whether `value` is a sane (maybe not valid) YYYYMMDD date.
@@ -76,6 +84,7 @@ def is_sane_day(value):
return False
return date(year=year,month=month,day=day)


@staticmethod
def is_sane_week(value):
"""Return whether `value` is a sane (maybe not valid) YYYYWW epiweek.
@@ -91,22 +100,24 @@ def is_sane_week(value):
return False
return value


@staticmethod
def find_issue_specific_csv_files(scan_dir, glob=glob):
def find_issue_specific_csv_files(scan_dir):
logger = get_structured_logger('find_issue_specific_csv_files')
for path in sorted(glob.glob(os.path.join(scan_dir, '*'))):
for path in sorted(glob(os.path.join(scan_dir, '*'))):
issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower())
if issuedir_match and os.path.isdir(path):
issue_date_value = int(issuedir_match.group(2))
issue_date = CsvImporter.is_sane_day(issue_date_value)
if issue_date:
logger.info(event='processing csv files from issue', detail=issue_date, file=path)
yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date)), glob=glob)
yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date)))
else:
logger.warning(event='invalid issue directory day', detail=issue_date_value, file=path)


@staticmethod
def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today())), glob=glob):
def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()))):
"""Recursively search for and yield covidcast-format CSV files.

scan_dir: the directory to scan (recursively)
@@ -122,11 +133,11 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
issue_value=-1
lag_value=-1

for path in sorted(glob.glob(os.path.join(scan_dir, '*', '*'))):

for path in sorted(glob(os.path.join(scan_dir, '*', '*'))):
# safe to ignore this file
if not path.lower().endswith('.csv'):
# safe to ignore this file
continue

# match a daily or weekly naming pattern
daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
@@ -174,14 +185,16 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
yield (path, None)
continue

yield (path, (source, signal, time_type, geo_type, time_value, issue_value, lag_value))
yield (path, PathDetails(issue_value, lag_value, source, signal, time_type, time_value, geo_type))


@staticmethod
def is_header_valid(columns):
"""Return whether the given pandas columns contains the required fields."""

return set(columns) >= CsvImporter.REQUIRED_COLUMNS


@staticmethod
def floaty_int(value: str) -> int:
"""Cast a string to an int, even if it looks like a float.
@@ -195,6 +208,7 @@ def floaty_int(value: str) -> int:
raise ValueError('not an int: "%s"' % str(value))
return int(float_value)


@staticmethod
def maybe_apply(func, quantity):
"""Apply the given function to the given quantity if not null-ish."""
@@ -205,6 +219,7 @@ def maybe_apply(func, quantity):
else:
return func(quantity)


@staticmethod
def validate_quantity(row, attr_quantity):
"""Take a row and validate a given associated quantity (e.g., val, se, stderr).
@@ -218,6 +233,7 @@ def validate_quantity(row, attr_quantity):
# val was a string or another data
return "Error"


@staticmethod
def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None):
"""Take a row and validate the missing code associated with
@@ -250,8 +266,9 @@ def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=N

return missing_entry


@staticmethod
def extract_and_check_row(row, geo_type, filepath=None):
def extract_and_check_row(row: DFRow, geo_type: str, filepath: Optional[str] = None) -> Tuple[Optional[CsvRowValue], Optional[str]]:
"""Extract and return `CsvRowValue` from a CSV row, with sanity checks.

Also returns the name of the field which failed sanity check, or None.
@@ -331,8 +348,9 @@ def extract_and_check_row(row, geo_type, filepath=None):
# return extracted and validated row values
return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None)


@staticmethod
def load_csv(filepath, geo_type):
def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[CovidcastRow]]:
"""Load, validate, and yield data as `RowValues` from a CSV file.

filepath: the CSV file to be loaded
@@ -357,9 +375,32 @@ def load_csv(filepath, geo_type):
table.rename(columns={"val": "value", "se": "stderr", "missing_val": "missing_value", "missing_se": "missing_stderr"}, inplace=True)

for row in table.itertuples(index=False):
row_values, error = CsvImporter.extract_and_check_row(row, geo_type, filepath)
csv_row_values, error = CsvImporter.extract_and_check_row(row, details.geo_type, filepath)

if error:
logger.warning(event = 'invalid value for row', detail=(str(row), error), file=filepath)
yield None
continue
yield row_values

yield CovidcastRow(
details.source,
details.signal,
details.time_type,
details.geo_type,
details.time_value,
csv_row_values.geo_value,
csv_row_values.value,
csv_row_values.stderr,
csv_row_values.sample_size,
csv_row_values.missing_value,
csv_row_values.missing_stderr,
csv_row_values.missing_sample_size,
details.issue,
details.lag,
# These four fields are unused by database acquisition
# TODO: These will be used when CovidcastRow is updated.
# id=None,
# direction=None,
# direction_updated_timestamp=0,
# value_updated_timestamp=0,
)
79 changes: 40 additions & 39 deletions src/acquisition/covidcast/csv_to_database.py
@@ -4,10 +4,12 @@
import argparse
import os
import time
from logging import Logger
from typing import Callable, Iterable, Optional, Tuple

# first party
from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter
from delphi.epidata.acquisition.covidcast.database import Database, CovidcastRow, DBLoadStateException
from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, PathDetails
from delphi.epidata.acquisition.covidcast.database import Database, DBLoadStateException
from delphi.epidata.acquisition.covidcast.file_archiver import FileArchiver
from delphi.epidata.acquisition.covidcast.logger import get_structured_logger

@@ -28,17 +30,19 @@ def get_argument_parser():
help="filename for log output (defaults to stdout)")
return parser

def collect_files(data_dir, specific_issue_date,csv_importer_impl=CsvImporter):

def collect_files(data_dir: str, specific_issue_date: bool):
"""Fetch path and data profile details for each file to upload."""
logger= get_structured_logger('collect_files')
if specific_issue_date:
results = list(csv_importer_impl.find_issue_specific_csv_files(data_dir))
results = list(CsvImporter.find_issue_specific_csv_files(data_dir))
else:
results = list(csv_importer_impl.find_csv_files(os.path.join(data_dir, 'receiving')))
results = list(CsvImporter.find_csv_files(os.path.join(data_dir, 'receiving')))
logger.info(f'found {len(results)} files')
return results

def make_handlers(data_dir, specific_issue_date, file_archiver_impl=FileArchiver):

def make_handlers(data_dir: str, specific_issue_date: bool):
if specific_issue_date:
# issue-specific uploads are always one-offs, so we can leave all
# files in place without worrying about cleaning up
@@ -47,7 +51,7 @@ def handle_failed(path_src, filename, source, logger):

def handle_successful(path_src, filename, source, logger):
logger.info(event='archiving as successful',file=filename)
file_archiver_impl.archive_inplace(path_src, filename)
FileArchiver.archive_inplace(path_src, filename)
else:
# normal automation runs require some shuffling to remove files
# from receiving and place them in the archive
@@ -59,22 +63,24 @@ def handle_failed(path_src, filename, source, logger):
logger.info(event='archiving as failed - ', detail=source, file=filename)
path_dst = os.path.join(archive_failed_dir, source)
compress = False
file_archiver_impl.archive_file(path_src, path_dst, filename, compress)
FileArchiver.archive_file(path_src, path_dst, filename, compress)

# helper to archive a successful file with compression
def handle_successful(path_src, filename, source, logger):
logger.info(event='archiving as successful',file=filename)
path_dst = os.path.join(archive_successful_dir, source)
compress = True
file_archiver_impl.archive_file(path_src, path_dst, filename, compress)
FileArchiver.archive_file(path_src, path_dst, filename, compress)

return handle_successful, handle_failed


def upload_archive(
path_details,
database,
handlers,
logger,
csv_importer_impl=CsvImporter):
path_details: Iterable[Tuple[str, Optional[PathDetails]]],
database: Database,
handlers: Tuple[Callable],
logger: Logger
):
"""Upload CSVs to the database and archive them using the specified handlers.

:path_details: output from CsvImporter.find*_csv_files
@@ -89,20 +95,16 @@ def upload_archive(
total_modified_row_count = 0
# iterate over each file
for path, details in path_details:
logger.info(event='handling',dest=path)
logger.info(event='handling', dest=path)
path_src, filename = os.path.split(path)

# file path or name was invalid, source is unknown
if not details:
# file path or name was invalid, source is unknown
archive_as_failed(path_src, filename, 'unknown',logger)
continue

(source, signal, time_type, geo_type, time_value, issue, lag) = details

csv_rows = csv_importer_impl.load_csv(path, geo_type)

cc_rows = CovidcastRow.fromCsvRows(csv_rows, source, signal, time_type, geo_type, time_value, issue, lag)
rows_list = list(cc_rows)
csv_rows = CsvImporter.load_csv(path, details)
rows_list = list(csv_rows)
Comment on lines +106 to +107
Contributor

praise: this is an enormous improvement

way more readable and less dependent on catching small errors in long tuple definitions. I have no idea why load_csv wasn't returning a CovidcastRow iterator to begin with.
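
A small, self-contained sketch of the point above: named-attribute access on the PathDetails NamedTuple (defined earlier in this diff) versus positional unpacking. The field values are made up for illustration:

from typing import NamedTuple

PathDetails = NamedTuple('PathDetails', [('issue', int), ('lag', int), ('source', str), ('signal', str), ('time_type', str), ('time_value', int), ('geo_type', str)])

# hypothetical values, for illustration only
details = PathDetails(issue=20230125, lag=2, source='example-source', signal='example-signal', time_type='day', time_value=20230123, geo_type='county')

# positional unpacking (the old tuple style) only works if every call site remembers the exact field order
issue, lag, source, signal, time_type, time_value, geo_type = details

# named access (what load_csv's callers use now) is order-independent and self-documenting
print(details.source, details.signal, details.time_value, details.issue)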

all_rows_valid = rows_list and all(r is not None for r in rows_list)
if all_rows_valid:
try:
@@ -111,12 +113,13 @@ def upload_archive(
logger.info(
"Inserted database rows",
row_count = modified_row_count,
source = source,
signal = signal,
geo_type = geo_type,
time_value = time_value,
issue = issue,
lag = lag)
source = details.source,
signal = details.signal,
geo_type = details.geo_type,
time_value = details.time_value,
issue = details.issue,
lag = details.lag
)
if modified_row_count is None or modified_row_count: # else would indicate zero rows inserted
total_modified_row_count += (modified_row_count if modified_row_count else 0)
database.commit()
@@ -131,40 +134,37 @@ def upload_archive(

# archive the current file based on validation results
if all_rows_valid:
archive_as_successful(path_src, filename, source, logger)
archive_as_successful(path_src, filename, details.source, logger)
else:
archive_as_failed(path_src, filename, source,logger)
archive_as_failed(path_src, filename, details.source, logger)

return total_modified_row_count


def main(
args,
database_impl=Database,
collect_files_impl=collect_files,
upload_archive_impl=upload_archive):
def main(args):
"""Find, parse, and upload covidcast signals."""

logger = get_structured_logger("csv_ingestion", filename=args.log_file)
start_time = time.time()

# shortcut escape without hitting db if nothing to do
path_details = collect_files_impl(args.data_dir, args.specific_issue_date)
path_details = collect_files(args.data_dir, args.specific_issue_date)
if not path_details:
logger.info('nothing to do; exiting...')
return

logger.info("Ingesting CSVs", csv_count = len(path_details))

database = database_impl()
database = Database()
database.connect()

try:
modified_row_count = upload_archive_impl(
modified_row_count = upload_archive(
path_details,
database,
make_handlers(args.data_dir, args.specific_issue_date),
logger)
logger
)
logger.info("Finished inserting/updating database rows", row_count = modified_row_count)
finally:
database.do_analyze()
@@ -175,5 +175,6 @@ def main(
"Ingested CSVs into database",
total_runtime_in_seconds=round(time.time() - start_time, 2))


if __name__ == '__main__':
main(get_argument_parser().parse_args())