diff --git a/.gitignore b/.gitignore index 9c88f0f..b389802 100644 --- a/.gitignore +++ b/.gitignore @@ -157,3 +157,5 @@ src/pyosmeta/_version.py *.pickle .token token.txt + +.bash_profile diff --git a/README.md b/README.md index 1dbae3e..3c59636 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ [![Publish to PyPI](https://github.com/pyOpenSci/pyosMeta/actions/workflows/publish-pypi.yml/badge.svg)](https://github.com/pyOpenSci/pyosMeta/actions/workflows/publish-pypi.yml) [![.github/workflows/test-run-script.yml](https://github.com/pyOpenSci/pyosMeta/actions/workflows/test-run-script.yml/badge.svg)](https://github.com/pyOpenSci/pyosMeta/actions/workflows/test-run-script.yml) - ## Description **pyosmeta** provides the tools and scripts used to manage [pyOpenSci](https://pyopensci.org)'s contributor and peer @@ -25,18 +24,17 @@ This repo contains a small module and several CLI scripts, including: _Since pyOpenSci uses this tool for its website, we expect this package to have infrequent releases._ - ## Installation Using pip: -``` +```console pip install pyosmeta ``` Using conda: -``` +```console conda install pyosmeta ``` diff --git a/src/pyosmeta/cli/process_reviews.py b/src/pyosmeta/cli/process_reviews.py index c71a181..53f45b6 100644 --- a/src/pyosmeta/cli/process_reviews.py +++ b/src/pyosmeta/cli/process_reviews.py @@ -38,8 +38,6 @@ def main(): process_review = ProcessIssues(github_api) # Get all issues for approved packages - load as dict - # TODO: this doesn't have to be in process issues at all. it could fully - # Call the github module issues = process_review.get_issues() accepted_reviews, errors = process_review.parse_issues(issues) for url, error in errors.items(): @@ -48,11 +46,9 @@ def main(): print("-" * 20) # Update gh metrics via api for all packages - print("Getting GitHub metrics for all packages...") - repo_endpoints = process_review.get_repo_endpoints(accepted_reviews) - all_reviews = process_review.get_gh_metrics( - repo_endpoints, accepted_reviews - ) + # Contrib count is only available via rest api + repo_paths = process_review.get_repo_paths(accepted_reviews) + all_reviews = github_api.get_gh_metrics(repo_paths, accepted_reviews) with open("all_reviews.pickle", "wb") as f: pickle.dump(all_reviews, f) diff --git a/src/pyosmeta/github_api.py b/src/pyosmeta/github_api.py index c8c7651..d7c94da 100644 --- a/src/pyosmeta/github_api.py +++ b/src/pyosmeta/github_api.py @@ -18,6 +18,8 @@ import requests from dotenv import load_dotenv +from pyosmeta.models import ReviewModel + @dataclass class GitHubAPI: @@ -131,6 +133,7 @@ def handle_rate_limit(self, response): If the remaining requests are exhausted, it calculates the time until the rate limit resets and sleeps accordingly. """ + print("hello there") if "X-RateLimit-Remaining" in response.headers: remaining_requests = int(response.headers["X-RateLimit-Remaining"]) @@ -139,25 +142,25 @@ def handle_rate_limit(self, response): sleep_time = max(reset_time - time.time(), 0) + 1 time.sleep(sleep_time) - def return_response(self) -> list[dict[str, object]]: - """ - Make a GET request to the Github API endpoint - Deserialize json response to list of dicts. + def _get_response_rest(self, url: str) -> list[dict[str, Any]]: + """Make a GET request to the GitHub REST API. + Handles pagination and rate limiting. - Handles pagination as github has a REST api 100 request max. + Parameters + ---------- + url : str + The API endpoint URL. Returns ------- - list - List of dict items each containing a review issue + list[dict[str, Any]] + A list of JSON responses from GitHub API requests. """ - results = [] - # This is computed as a property. Reassign here to support pagination - # and new urls for each page - api_endpoint_url = self.api_endpoint + api_endpoint_url = url + try: - while True: + while api_endpoint_url: response = requests.get( api_endpoint_url, headers={"Authorization": f"token {self.get_token()}"}, @@ -165,30 +168,52 @@ def return_response(self) -> list[dict[str, object]]: response.raise_for_status() results.extend(response.json()) - # Check if there are more pages to fetch - if "next" in response.links: - next_url = response.links["next"]["url"] - api_endpoint_url = next_url - else: - break - - # Handle rate limiting + # Handle pagination & rate limiting + api_endpoint_url = response.links.get("next", {}).get("url") self.handle_rate_limit(response) except requests.HTTPError as exception: if exception.response.status_code == 401: - print( - "Oops - your request isn't authorized. Your token may be " - "expired or invalid. Please refresh your token." + logging.error( + "Unauthorized request. Your token may be expired or invalid. Please refresh your token." ) else: raise exception return results - def get_repo_meta(self, url: str) -> dict[str, Any] | None: + def get_gh_metrics( + self, + endpoints: dict[dict[str, str]], + reviews: dict[str, ReviewModel], + ) -> dict[str, ReviewModel]: """ - Get GitHub metrics from the GitHub API for a single repository. + Get GitHub metrics for all reviews using provided repo name and owner. + Does not work on GitLab currently + + Parameters: + ---------- + endpoints : dict + A dictionary mapping package names to their owner and repo-names. + reviews : dict + A dictionary containing review data. + + Returns: + ------- + dict + Updated review data with GitHub metrics. + """ + + for pkg_name, owner_repo in endpoints.items(): + reviews[pkg_name].gh_meta = self.get_repo_meta(owner_repo) + + return reviews + + def _get_contrib_count_rest(self, url: str) -> int | None: + """ + Returns the count of total contributors to a repository. + + Uses the rest API because graphql can't access this specific metric Parameters ---------- @@ -197,116 +222,169 @@ def get_repo_meta(self, url: str) -> dict[str, Any] | None: Returns ------- - Optional[Dict[str, Any]] - A dictionary containing the specified GitHub metrics for the repository. - Returns None if the repository is not found or access is forbidden. + int + The count of total contributors to the repository. Notes ----- - This method makes a GET call to the GitHub API to retrieve metadata - about a pyos reviewed package repository. - - If the repository is not found (status code 404) or access is forbidden - (status code 403), this method returns None. + This method makes a GET call to the GitHub API to retrieve + total contributors for the specified repository. It then returns the + count of contributors. + If the repository is not found (status code 404), a warning message is + logged, and the method returns None. """ + # https://api.github.com/repos/{owner}/{repo}/contributors + repo_contribs_url = f"https://api.github.com/repos/{url['owner']}/{url['repo_name']}/contributors" + contributors = self._get_response_rest(repo_contribs_url) - # Get the url (normally the docs) and description of a repo - response = requests.get( - url, headers={"Authorization": f"token {self.get_token()}"} - ) - - # Check if the request was successful (status code 2xx) - if response.ok: - return response.json() - - # Handle specific HTTP errors - elif response.status_code == 404: + if not contributors: logging.warning( - f"Repository not found: {url}. Did the repo URL change?" - ) - return None - elif response.status_code == 403: - # Construct a single warning message with formatted strings - warning_message = ( - "Oops! You may have hit an API limit for URL: {url}.\n" - f"API Response Text: {response.text}\n" - f"API Response Headers: {response.headers}" + f"Repository not found: {repo_contribs_url}. Did the repo URL change?" ) - logging.warning(warning_message) return None - else: - # Log other HTTP errors - logging.warning( - f"Unexpected HTTP error: {response.status_code} URL: {url}" - ) - return None + return len(contributors) - def get_repo_contribs(self, url: str) -> int | None: + def _get_metrics_graphql( + self, repo_info: dict[str, str] + ) -> dict[str, Any] | None: """ - Returns the count of total contributors to a repository. + Get GitHub metrics from the GitHub GraphQL API for a single repository. Parameters ---------- - url : str - The URL of the repository. + repo_info : dict + A dictionary containing the owner and repository name. Returns ------- - int - The count of total contributors to the repository. + Optional[Dict[str, Any]] + A dictionary containing the specified GitHub metrics for the repository. + Returns None if the repository is not found or access is forbidden. Notes ----- - This method makes a GET call to the GitHub API to retrieve - total contributors for the specified repository. It then returns the - count of contributors. + This method makes a GraphQL call to the GitHub API to retrieve metadata + about a pyos reviewed package repository. - If the repository is not found (status code 404), a warning message is - logged, and the method returns None. + If the repository is not found or access is forbidden, this method returns None. """ - repo_contribs_url = url + "/contributors" + query = """ + query($owner: String!, $name: String!) { + repository(owner: $owner, name: $name) { + name + description + homepageUrl + createdAt + stargazers { + totalCount + } + watchers { + totalCount + } + issues(states: OPEN) { + totalCount + } + forks { + totalCount + } + defaultBranchRef { + target { + ... on Commit { + history(first: 1) { + edges { + node { + committedDate + } + } + } + } + } + } + collaborators { + totalCount + } + } + } + """ - # Get the url (normally the docs) and repository description - response = requests.get( - repo_contribs_url, - headers={"Authorization": f"token {self.get_token()}"}, + variables = { + "owner": repo_info["owner"], + "name": repo_info["repo_name"], + } + + headers = {"Authorization": f"Bearer {self.get_token()}"} + + response = requests.post( + "https://api.github.com/graphql", + json={"query": query, "variables": variables}, + headers=headers, ) - # Handle 404 error (Repository not found) - if response.status_code == 404: + if response.status_code == 200: + data = response.json() + repo_data = data["data"]["repository"] + + return { + "name": repo_data["name"], + "description": repo_data["description"], + "documentation": repo_data["homepageUrl"], + "created_at": repo_data["createdAt"], + "stargazers_count": repo_data["stargazers"]["totalCount"], + "watchers_count": repo_data["watchers"]["totalCount"], + "open_issues_count": repo_data["issues"]["totalCount"], + "forks_count": repo_data["forks"]["totalCount"], + "last_commit": repo_data["defaultBranchRef"]["target"][ + "history" + ]["edges"][0]["node"]["committedDate"], + } + elif response.status_code == 404: + logging.warning( + f"Repository not found: {repo_info['owner']}/{repo_info['repo_name']}. Did the repo URL change?" + ) + return None + elif response.status_code == 403: logging.warning( - f"Repository not found: {repo_contribs_url}. " - "Did the repo URL change?" + f"Oops! You may have hit an API limit for repository: {repo_info['owner']}/{repo_info['repo_name']}.\n" + f"API Response Text: {response.text}\n" + f"API Response Headers: {response.headers}" ) return None - # Return total contributors else: - return len(response.json()) + logging.warning( + f"Unexpected HTTP error: {response.status_code} for repository: {repo_info['owner']}/{repo_info['repo_name']}" + ) + return None - def get_last_commit(self, repo: str) -> str: - """Returns the last commit to the repository. + def get_repo_meta( + self, repo_info: dict[str, str] + ) -> dict[str, Any] | None: + """Get GitHub metrics from the GitHub GraphQL API for a repository. Parameters ---------- - str : string - A string containing a datetime object representing the datetime of - the last commit to the repo + repo_info : dict + A dictionary containing the owner and repository name. Returns ------- - str - String representing the timestamp for the last commit to the repo. + Optional[Dict[str, Any]] + A dictionary containing the specified GitHub metrics for the repository. + Returns None if the repository is not found or access is forbidden. + + Notes + ----- + This method makes a GraphQL call to the GitHub API to retrieve metadata + about a pyos reviewed package repository. + + If the repository is not found or access is forbidden, it returns None. """ - url = repo + "/commits" - response = requests.get( - url, headers={"Authorization": f"token {self.get_token()}"} - ).json() - date = response[0]["commit"]["author"]["date"] + metrics = self._get_metrics_graphql(repo_info) + metrics["contrib_count"] = self._get_contrib_count_rest(repo_info) - return date + return metrics def get_user_info( self, gh_handle: str, name: Optional[str] = None diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index 2c5fcfd..64d2f22 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -44,19 +44,6 @@ def __init__(self, github_api: GitHubAPI): self.github_api = github_api - # These are the github metrics to return on a package - # It could be simpler to implement this using graphQL - gh_stats = [ - "name", - "description", - "homepage", - "created_at", - "stargazers_count", - "watchers_count", - "open_issues_count", - "forks_count", - ] - def get_issues(self) -> list[Issue]: """ Call return response in GitHub api object. @@ -75,10 +62,11 @@ def get_issues(self) -> list[Issue]: need to use an OR as a selector. """ - issues = self.github_api.return_response() - # Filter labels according to label select input - labels = self.github_api.labels + url = self.github_api.api_endpoint + issues = self.github_api._get_response_rest(url) + # Filter issues according to label query value + labels = self.github_api.labels filtered_issues = [ issue for issue in issues @@ -303,7 +291,7 @@ def parse_issue(self, issue: Issue | str) -> ReviewModel: ], ) - # Finalize review model before casting + # Finalize & cleanup review model before casting model = self._postprocess_meta(model, body) model = self._postprocess_labels(model) @@ -333,6 +321,7 @@ def parse_issues( errors = {} for issue in issues: print(f"Processing review {issue.title}") + try: review = self.parse_issue(issue) reviews[review.package_name] = review @@ -367,12 +356,15 @@ def get_contributor_data( models = models[0] return models - # TODO: decide if this belongs here or in the github obj? - def get_repo_endpoints( + # TODO: This now returns a dict of owner:repo_name to support graphql + def get_repo_paths( self, review_issues: dict[str, ReviewModel] - ) -> dict[str, str]: + ) -> dict[str, dict[str, str]]: """ - Returns a list of repository endpoints + Returns a dictionary of repository owner and names for each package. + + Currently we don't have API access setup for gitlab. So skip if + url contains gitlab Parameters ---------- @@ -381,104 +373,26 @@ def get_repo_endpoints( Returns ------- - Dict - Containing pkg_name: endpoint for each review. - + dict + Containing pkg_name: {owner: repo} for each review. """ all_repos = {} for a_package in review_issues.keys(): - repo = review_issues[a_package].repository_link.strip("/") - owner, repo = repo.split("/")[-2:] - # TODO: could be simpler code - Remove any link remnants - pattern = r"[\(\)\[\]?]" - owner = re.sub(pattern, "", owner) - repo = re.sub(pattern, "", repo) - all_repos[a_package] = ( - f"https://api.github.com/repos/{owner}/{repo}" - ) - return all_repos - - # Rename to process_gh_metrics? - def get_gh_metrics( - self, - endpoints: dict[str, str], - reviews: dict[str, ReviewModel], - ) -> dict[str, ReviewModel]: - """ - Get GitHub metrics for each review based on provided endpoints. - - Parameters: - ---------- - endpoints : dict - A dictionary mapping package names to their GitHub URLs. - reviews : dict - A dictionary containing review data. - - Returns: - ------- - dict - Updated review data with GitHub metrics. - """ - pkg_meta = {} - # url is the api endpoint for a specific pyos-reviewed package repo - for pkg_name, url in endpoints.items(): - print(f"Processing GitHub metrics {pkg_name}") - pkg_meta[pkg_name] = self.process_repo_meta(url) - - # These 2 lines both hit the API directly - pkg_meta[pkg_name]["contrib_count"] = ( - self.github_api.get_repo_contribs(url) - ) - pkg_meta[pkg_name]["last_commit"] = ( - self.github_api.get_last_commit(url) + repo_url = review_issues[a_package].repository_link + # for now skip if it's a gitlab repo + if "gitlab" in repo_url: + continue + owner, repo = ( + repo_url.replace("https://github.com/", "") + .replace("https://www.github.com/", "") + .rstrip("/") + .split("/", 1) ) - # Add github meta to review metadata - reviews[pkg_name].gh_meta = pkg_meta[pkg_name] - return reviews - - # This is also github related - def process_repo_meta(self, url: str) -> dict[str, Any]: - """ - Process metadata from the GitHub API about a single repository. - - Parameters - ---------- - url : str - The URL of the repository. - - Returns - ------- - dict[str, Any] - A dictionary containing the specified GitHub metrics for the repo. - - Notes - ----- - This method uses our github module to process returned data from a - GET call to the GitHub API to retrieve metadata about a package - repository. It then extracts the desired metrics from the API response - and constructs a dictionary with these metrics. - - The `homepage` metric is renamed to `documentation` in the returned - dictionary. - - """ - - stats_dict = {} - # Returns the raw top-level github API response for the repo - gh_repo_response = self.github_api.get_repo_meta(url) - - # Retrieve the metrics that we want to track - for astat in self.gh_stats: - stats_dict[astat] = gh_repo_response[astat] - - stats_dict["documentation"] = stats_dict.pop("homepage") - - return stats_dict + all_repos[a_package] = {"owner": owner, "repo_name": repo} + return all_repos - # This works - i could just make it more generic and remove fmt since it's - # not used and replace it with a number of values and a test string def get_categories( self, issue_list: list[str], diff --git a/tests/unit/test_github_api.py b/tests/unit/test_githubapi.py similarity index 100% rename from tests/unit/test_github_api.py rename to tests/unit/test_githubapi.py diff --git a/tests/unit/test_githubapi_rate_limiting.py b/tests/unit/test_githubapi_rate_limiting.py new file mode 100644 index 0000000..2e70343 --- /dev/null +++ b/tests/unit/test_githubapi_rate_limiting.py @@ -0,0 +1,40 @@ +import time +from unittest.mock import Mock, patch + +from pyosmeta.github_api import GitHubAPI + + +class TestGitHubAPI: + def setup_method(self): + """Set up an instance of GitHubAPI before each test.""" + self.api = GitHubAPI() + + def test_no_rate_limit(self): + """Test when rate limit is not reached (should not sleep).""" + mock_response = Mock(headers={"X-RateLimit-Remaining": "10"}) + with patch("time.sleep") as mock_sleep: + self.api.handle_rate_limit(mock_response) + mock_sleep.assert_not_called() + + def test_rate_limit_reached(self): + """Test when rate limit is exhausted (should sleep until reset).""" + mock_response = Mock( + headers={ + "X-RateLimit-Remaining": "0", + # Reset in 10 seconds + "X-RateLimit-Reset": str(int(time.time()) + 10), + } + ) + with patch("time.sleep") as mock_sleep: + self.api.handle_rate_limit(mock_response) + # Sleep should be called once + mock_sleep.assert_called_once() + sleep_time = mock_sleep.call_args[0][0] + assert 9 <= sleep_time <= 11 + + def test_missing_headers(self): + """Test when rate limit headers are missing (should do nothing).""" + mock_response = Mock(headers={}) # No rate limit headers + with patch("time.sleep") as mock_sleep: + self.api.handle_rate_limit(mock_response) + mock_sleep.assert_not_called() diff --git a/tests/unit/test_githubapi_return_response_rest.py b/tests/unit/test_githubapi_return_response_rest.py new file mode 100644 index 0000000..39157a4 --- /dev/null +++ b/tests/unit/test_githubapi_return_response_rest.py @@ -0,0 +1,110 @@ +from unittest.mock import Mock, patch + +import pytest +import requests + +from pyosmeta.github_api import GitHubAPI + + +class TestGitHubAPI: + @pytest.fixture(autouse=True) + def setup_method(self): + """Set up an instance of GitHubAPI before each test.""" + self.api = GitHubAPI() + + # Create default mock response + mock_response = Mock() + mock_response.json.return_value = [{"id": 1, "name": "repo1"}] + mock_response.links = {} # No pagination + mock_response.status_code = 200 + mock_response.headers = {"X-RateLimit-Remaining": "10"} + # Patch requests.get for all tests + self.mock_get_patcher = patch( + "requests.get", return_value=mock_response + ) + self.mock_get = self.mock_get_patcher.start() + + def teardown_method(self): + """Stop the patch after each test.""" + patch.stopall() + + def test_single_page_response(self): + """Test a successful API response with a single page (no pagination).""" + print("hi") + # print(self.mock_get.call_args) + result = self.api._get_response_rest( + "https://api.github.com/repos/test" + ) + assert result == [{"id": 1, "name": "repo1"}] + + def test_multi_page_response(self): + """Test handling multiple pages (pagination). + This test is unusual because we setup a non paginated response + in the setup method. So we have to stop that mock and restart it with + new values for it to work properly. + """ + # Make sure the setup mock doesn't propagate here + self.mock_get.stop() + # repatch requests.get with new mock behavior (paginated) + self.mock_get = patch("requests.get").start() + # Simulate pagination with multiple response Mocks + self.mock_get.side_effect = [ + Mock( + json=Mock(return_value=[{"id": 1, "name": "repo1"}]), + links={ + "next": {"url": "https://api.github.com/repos/test?page=2"} + }, + status_code=200, + headers={"X-RateLimit-Remaining": "10"}, + ), + Mock( + json=Mock(return_value=[{"id": 2, "name": "repo2"}]), + links={}, + status_code=200, + headers={"X-RateLimit-Remaining": "10"}, + ), + ] + + result = self.api._get_response_rest( + "https://api.github.com/repos/test" + ) + assert result == [ + {"id": 1, "name": "repo1"}, + {"id": 2, "name": "repo2"}, + ] + assert self.mock_get.call_count == 2 + + @patch.object(GitHubAPI, "handle_rate_limit") + def test_rate_limit_handling(self, mock_handle_rate_limit): + """Test that rate limiting is handled correctly.""" + result = self.api._get_response_rest( + "https://api.github.com/repos/test" + ) + assert result == [{"id": 1, "name": "repo1"}] + mock_handle_rate_limit.assert_called_once_with( + self.mock_get.return_value + ) + + def test_unauthorized_request(self): + """Test handling of an unauthorized (401) response.""" + self.mock_get.return_value.status_code = 401 + self.mock_get.return_value.raise_for_status.side_effect = ( + requests.HTTPError(response=self.mock_get.return_value) + ) + + with patch("logging.error") as mock_log: + result = self.api._get_response_rest( + "https://api.github.com/repos/test" + ) + assert result == [] + mock_log.assert_called_once() + + def test_general_http_error(self): + """Test handling of a general HTTP error (e.g., 500).""" + self.mock_get.return_value.status_code = 500 + self.mock_get.return_value.raise_for_status.side_effect = ( + requests.HTTPError(response=self.mock_get.return_value) + ) + + with pytest.raises(requests.HTTPError): + self.api._get_response_rest("https://api.github.com/repos/test")