Skip to content

Consistent logging #270

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 16, 2025
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ See [GitHub releases](https://github.com/pyOpenSci/pyosMeta/releases) page for a

## [Unreleased]

* Use a consistent logger for informational/debug outputs. Using print statements can make it tough to track down which line of code emitted the message and using the `warnings` module will suppress recurring warnings.
* Added `tqdm` as a dependency to improve progress monitoring when running data processing scripts (@banesullivan)

## [v1.6] - 2025-02-17

## What's Changed
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dependencies = [
"python-dotenv",
"requests",
"ruamel-yaml>=0.17.21",
"tqdm",
]
# This is metadata that pip reads to understand what Python versions your package supports
requires-python = ">=3.10"
Expand All @@ -42,7 +43,7 @@ dev = [
"pre-commit",
"pytest",
"pytest-cov",
"pytest-mock"
"pytest-mock",
]

[project.urls]
Expand Down
13 changes: 8 additions & 5 deletions src/pyosmeta/cli/process_reviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from pyosmeta import ProcessIssues
from pyosmeta.github_api import GitHubAPI
from pyosmeta.logging import logger


def main():
Expand All @@ -40,15 +41,17 @@ def main():
# Get all issues for approved packages - load as dict
issues = process_review.get_issues()
accepted_reviews, errors = process_review.parse_issues(issues)
for url, error in errors.items():
print(f"Error in review at url: {url}")
print(error)
print("-" * 20)
if len(errors):
if errors:
logger.error("Errors found when parsing reviews (printed to stdout):")
for url, error in errors.items():
print(f"Error in review at url: {url}")
print(error)
print("-" * 20)
raise RuntimeError("Errors in parsing reviews, see printout above")

# Update gh metrics via api for all packages
# Contrib count is only available via rest api
logger.info("Getting GitHub metrics for all packages...")
repo_paths = process_review.get_repo_paths(accepted_reviews)
all_reviews = github_api.get_gh_metrics(repo_paths, accepted_reviews)

Expand Down
38 changes: 21 additions & 17 deletions src/pyosmeta/cli/update_contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@
from datetime import datetime

from pydantic import ValidationError
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from pyosmeta.contributors import ProcessContributors
from pyosmeta.file_io import create_paths, load_pickle, open_yml_file
from pyosmeta.github_api import GitHubAPI
from pyosmeta.logging import logger
from pyosmeta.models import PersonModel


Expand Down Expand Up @@ -64,29 +67,28 @@ def main():

# Populate all existing contribs into model objects
all_contribs = {}
for a_contrib in web_contribs:
print(a_contrib["github_username"])
try:
all_contribs[a_contrib["github_username"].lower()] = PersonModel(
**a_contrib
)
except ValidationError as ve:
print(a_contrib["github_username"])
print(ve)

print("Done processing all-contribs")
for a_contrib in tqdm(web_contribs, desc="Processing all-contribs"):
username = a_contrib["github_username"]
tqdm.write(f"Processing {username}")
with logging_redirect_tqdm():
try:
all_contribs[username.lower()] = PersonModel(**a_contrib)
except ValidationError:
logger.error(f"Error processing {username}", exc_info=True)

# Create a list of all contributors across repositories
github_api = GitHubAPI()
process_contribs = ProcessContributors(github_api, json_files)
bot_all_contribs = process_contribs.combine_json_data()

print("Updating contrib types and searching for new users now")
for key, users in bot_all_contribs.items():
for key, users in tqdm(
bot_all_contribs.items(),
desc="Updating contrib types and searching for new users",
):
for gh_user in users:
# Find and populate data for any new contributors
if gh_user not in all_contribs.keys():
print("Missing", gh_user, "Adding them now")
logger.info(f"Missing {gh_user}, adding them now")
new_contrib = process_contribs.return_user_info(gh_user)
new_contrib["date_added"] = datetime.now().strftime("%Y-%m-%d")
all_contribs[gh_user] = PersonModel(**new_contrib)
Expand All @@ -95,8 +97,8 @@ def main():
all_contribs[gh_user].add_unique_value("contributor_type", key)

if update_all:
for user in all_contribs.keys():
print("Updating all user info from github", user)
for user in tqdm(all_contribs.keys(), desc="Updating all user info"):
tqdm.write(f"Updating all user info from github for {user}")
new_gh_data = process_contribs.return_user_info(user)

# TODO: turn this into a small update method
Expand Down Expand Up @@ -127,7 +129,9 @@ def main():
try:
setattr(data, "date_added", history[user])
except KeyError:
print(f"Username {user} must be new, skipping")
logger.error(
f"Username {user} must be new, skipping", exc_info=True
)

# Export to pickle which supports updates after parsing reviews
with open("all_contribs.pickle", "wb") as f:
Expand Down
69 changes: 42 additions & 27 deletions src/pyosmeta/cli/update_review_teams.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@
from datetime import datetime

from pydantic import ValidationError
from tqdm import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from pyosmeta.contributors import ProcessContributors
from pyosmeta.file_io import clean_export_yml, load_pickle
from pyosmeta.github_api import GitHubAPI
from pyosmeta.logging import logger
from pyosmeta.models import PersonModel, ReviewModel, ReviewUser
from pyosmeta.utils_clean import get_clean_user

Expand Down Expand Up @@ -106,13 +109,16 @@ def process_user(
if gh_user not in contribs.keys():
# If they aren't in the existing contribs.yml data, add them by using
# their github username and hitting the github api
print("Found a new contributor!", gh_user)
logger.info(f"Found a new contributor: {gh_user}")
new_contrib = processor.return_user_info(gh_user)
new_contrib["date_added"] = datetime.now().strftime("%Y-%m-%d")
try:
contribs[gh_user] = PersonModel(**new_contrib)
except ValidationError as ve:
print(ve)
except ValidationError:
logger.error(
f"Error processing new contributor {gh_user}. Skipping this user.",
exc_info=True,
)

# Update user the list of contribution types if there are new types to add
# for instance a new reviewer would have a "Reviewer" contributor type
Expand Down Expand Up @@ -143,33 +149,42 @@ def main():

contrib_types = process_contribs.contrib_types

for pkg_name, review in packages.items():
print("Processing review team for:", pkg_name)
for role in contrib_types.keys():
user: list[ReviewUser] | ReviewUser = getattr(review, role)

# Eic is a newer field, so in some instances it will be empty
# if it's empty print a message noting the data are missing
if user:
# Handle lists or single users separately
if isinstance(user, list):
for i, a_user in enumerate(user):
a_user, contribs = process_user(
a_user, role, pkg_name, contribs, process_contribs
for pkg_name, review in tqdm(
packages.items(), desc="Processing review teams"
):
with logging_redirect_tqdm():
tqdm.write(f"Processing review team for: {pkg_name}")
for role in contrib_types.keys():
user: list[ReviewUser] | ReviewUser = getattr(review, role)

# Eic is a newer field, so in some instances it will be empty
# if it's empty log a message noting the data are missing
if user:
# Handle lists or single users separately
if isinstance(user, list):
for i, a_user in enumerate(user):
a_user, contribs = process_user(
a_user,
role,
pkg_name,
contribs,
process_contribs,
)
# Update individual user in reference to issue list
user[i] = a_user
elif isinstance(user, ReviewUser):
user, contribs = process_user(
user, role, pkg_name, contribs, process_contribs
)
setattr(review, role, user)
else:
raise TypeError(
"Keys in the `contrib_types` map must be a `ReviewUser` or `list[ReviewUser]` in the `ReviewModel`"
)
# Update individual user in reference to issue list
user[i] = a_user
elif isinstance(user, ReviewUser):
user, contribs = process_user(
user, role, pkg_name, contribs, process_contribs
)
setattr(review, role, user)
else:
raise TypeError(
"Keys in the `contrib_types` map must be a `ReviewUser` or `list[ReviewUser]` in the `ReviewModel`"
logger.warning(
f"I can't find a username for {role} under {pkg_name}. Moving on."
)
else:
print(f"I can't find a username for {role}. Moving on.")

# Export to yaml
contribs_ls = [model.model_dump() for model in contribs.values()]
Expand Down
15 changes: 10 additions & 5 deletions src/pyosmeta/contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import requests

from .github_api import GitHubAPI
from .logging import logger


@dataclass
Expand Down Expand Up @@ -102,8 +103,10 @@
"""
try:
response = requests.get(json_path)
except Exception as ae:
print(ae)
except Exception:
logger.error(

Check warning on line 107 in src/pyosmeta/contributors.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/contributors.py#L106-L107

Added lines #L106 - L107 were not covered by tests
f"Error loading json file: {json_path}", exc_info=True
)
return json.loads(response.text)

def process_json_file(self, json_file: str) -> Tuple[str, List]:
Expand Down Expand Up @@ -150,8 +153,10 @@
try:
key, users = self.process_json_file(json_file)
combined_data[key] = users
except Exception as e:
print("Oops - can't process", json_file, e)
except Exception:
logger.error(

Check warning on line 157 in src/pyosmeta/contributors.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/contributors.py#L156-L157

Added lines #L156 - L157 were not covered by tests
f"Oops - can't process: {json_file}", exc_info=True
)
return combined_data

def return_user_info(
Expand Down Expand Up @@ -269,6 +274,6 @@

# If the user is not in the web dict, add them
else:
print("New user found. Adding: ", gh_user)
logger.info(f"New user found. Adding: {gh_user}")
webDict[gh_user] = repoDict[gh_user]
return webDict
6 changes: 4 additions & 2 deletions src/pyosmeta/file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import ruamel.yaml
from ruamel.yaml import YAML

from .logging import logger


def load_pickle(filename):
"""Opens a pickle"""
Expand Down Expand Up @@ -84,8 +86,8 @@
with urllib.request.urlopen(file_path) as f:
yaml = YAML(typ="safe", pure=True)
return yaml.load(f)
except urllib.error.URLError as url_error:
print("Oops - can find the url", file_path, url_error)
except urllib.error.URLError:
logger.error(f"Oops - can't find the url: {file_path}", exc_info=True)

Check warning on line 90 in src/pyosmeta/file_io.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/file_io.py#L89-L90

Added lines #L89 - L90 were not covered by tests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
logger.error(f"Oops - can find the url: {file_path}", exc_info=True)
logger.error(f"Oops - I can't find the url: {file_path}", exc_info=True)



def export_yaml(filename: str, data_list: list):
Expand Down
13 changes: 7 additions & 6 deletions src/pyosmeta/github_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
numbers, stars and more "health & stability" related metrics
"""

import logging
import os
import time
from dataclasses import dataclass
Expand All @@ -20,6 +19,8 @@

from pyosmeta.models import ReviewModel

from .logging import logger


@dataclass
class GitHubAPI:
Expand Down Expand Up @@ -172,7 +173,7 @@

except requests.HTTPError as exception:
if exception.response.status_code == 401:
logging.error(
logger.error(
"Unauthorized request. Your token may be expired or invalid. Please refresh your token."
)
else:
Expand Down Expand Up @@ -237,7 +238,7 @@
contributors = self._get_response_rest(repo_contribs_url)

if not contributors:
logging.warning(
logger.warning(

Check warning on line 241 in src/pyosmeta/github_api.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/github_api.py#L241

Added line #L241 was not covered by tests
f"Repository not found: {repo_contribs_url}. Did the repo URL change?"
)
return None
Expand Down Expand Up @@ -339,19 +340,19 @@
]["edges"][0]["node"]["committedDate"],
}
elif response.status_code == 404:
logging.warning(
logger.warning(

Check warning on line 343 in src/pyosmeta/github_api.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/github_api.py#L343

Added line #L343 was not covered by tests
f"Repository not found: {repo_info['owner']}/{repo_info['repo_name']}. Did the repo URL change?"
)
return None
elif response.status_code == 403:
logging.warning(
logger.warning(

Check warning on line 348 in src/pyosmeta/github_api.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/github_api.py#L348

Added line #L348 was not covered by tests
f"Oops! You may have hit an API limit for repository: {repo_info['owner']}/{repo_info['repo_name']}.\n"
f"API Response Text: {response.text}\n"
f"API Response Headers: {response.headers}"
)
return None
else:
logging.warning(
logger.warning(

Check warning on line 355 in src/pyosmeta/github_api.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/github_api.py#L355

Added line #L355 was not covered by tests
f"Unexpected HTTP error: {response.status_code} for repository: {repo_info['owner']}/{repo_info['repo_name']}"
)
return None
Expand Down
4 changes: 4 additions & 0 deletions src/pyosmeta/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import logging

# Shared package-wide logger so all pyosmeta modules and CLI scripts emit
# through one configurable channel instead of scattered print() calls.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Without an explicit handler, records fall through to logging.lastResort,
# which filters at WARNING and would silently drop the INFO messages the
# CLI scripts rely on.  Attach a stderr handler once; the guard keeps
# repeated imports/reloads from duplicating output.
# NOTE(review): if an application later calls logging.basicConfig(), records
# will also propagate to root — confirm whether propagation should be off.
if not logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    logger.addHandler(_handler)
11 changes: 8 additions & 3 deletions src/pyosmeta/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
field_validator,
)

from pyosmeta.logging import logger
from pyosmeta.models.github import Labels
from pyosmeta.utils_clean import (
check_url,
Expand Down Expand Up @@ -58,15 +59,19 @@
return url # Returns empty string if url is empty
else:
if url.startswith("http://"):
print(f"{url} 'http://' replacing w 'https://'")
logger.warning(

Check warning on line 62 in src/pyosmeta/models/base.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/models/base.py#L62

Added line #L62 was not covered by tests
f"Oops, http protocol for {url}, changing to https"
)
url = url.replace("http://", "https://")
elif not url.startswith("http"):
print("Oops, missing http")
logger.warning(

Check warning on line 67 in src/pyosmeta/models/base.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/models/base.py#L67

Added line #L67 was not covered by tests
f"Oops, missing http protocol for {url}, adding it"
)
url = "https://" + url
if check_url(url=url):
return url
else: # pragma: no cover
print(f"Oops, url `{url}` is not valid, removing it")
logger.warning(f"Oops, url `{url}` is not valid, removing it")
return None


Expand Down
Loading