diff --git a/tests/unit/forklift/test_legacy.py b/tests/unit/forklift/test_legacy.py index 3cab000eb157..6bd87049aa22 100644 --- a/tests/unit/forklift/test_legacy.py +++ b/tests/unit/forklift/test_legacy.py @@ -58,6 +58,7 @@ Project, ProjectMacaroonWarningAssociation, Release, + ReleaseURL, Role, ) from warehouse.packaging.tasks import sync_file_to_cache, update_bigquery_release_files @@ -3831,6 +3832,155 @@ def failing_verify(_self, _verifier, _policy, _dist): assert resp.status_code == 400 assert resp.status.startswith(expected_msg) + @pytest.mark.parametrize( + "url, expected", + [ + ("https://google.com", False), # Totally different + ("https://github.com/foo", False), # Missing parts + ("https://github.com/foo/bar/", True), # Exactly the same + ("https://github.com/foo/bar/readme.md", True), # Additional parts + ("https://github.com/foo/bar", True), # Missing trailing slash + ], + ) + def test_new_release_url_verified( + self, monkeypatch, pyramid_config, db_request, metrics, url, expected + ): + project = ProjectFactory.create() + publisher = GitHubPublisherFactory.create(projects=[project]) + publisher.repository_owner = "foo" + publisher.repository_name = "bar" + claims = {"sha": "somesha"} + identity = PublisherTokenContext(publisher, SignedClaims(claims)) + db_request.oidc_publisher = identity.publisher + db_request.oidc_claims = identity.claims + + db_request.db.add(Classifier(classifier="Environment :: Other Environment")) + db_request.db.add(Classifier(classifier="Programming Language :: Python")) + + filename = "{}-{}.tar.gz".format(project.name, "1.0") + + pyramid_config.testing_securitypolicy(identity=identity) + db_request.user_agent = "warehouse-tests/6.6.6" + db_request.POST = MultiDict( + { + "metadata_version": "1.2", + "name": project.name, + "version": "1.0", + "summary": "This is my summary!", + "filetype": "sdist", + "md5_digest": _TAR_GZ_PKG_MD5, + "content": pretend.stub( + filename=filename, + file=io.BytesIO(_TAR_GZ_PKG_TESTDATA), + type="application/tar", + ), + } + ) + db_request.POST.extend( + [ + ("classifiers", "Environment :: Other Environment"), + ("classifiers", "Programming Language :: Python"), + ("requires_dist", "foo"), + ("requires_dist", "bar (>1.0)"), + ("project_urls", f"Test, {url}"), + ("requires_external", "Cheese (>1.0)"), + ("provides", "testing"), + ] + ) + + storage_service = pretend.stub(store=lambda path, filepath, meta: None) + db_request.find_service = lambda svc, name=None, context=None: { + IFileStorage: storage_service, + IMetricsService: metrics, + }.get(svc) + + legacy.file_upload(db_request) + release_url = ( + db_request.db.query(ReleaseURL).filter(Release.project == project).one() + ) + assert release_url is not None + assert release_url.verified == expected + + def test_new_publisher_verifies_existing_release_url( + self, + monkeypatch, + pyramid_config, + db_request, + metrics, + ): + repo_name = "my_new_repo" + verified_url = "https://github.com/foo/bar" + unverified_url = f"https://github.com/foo/{repo_name}" + + project = ProjectFactory.create() + release = ReleaseFactory.create(project=project, version="1.0") + # We start with an existing release, with one verified URL and one unverified + # URL. Uploading a new file with a Trusted Publisher that matches the unverified + # URL should mark it as verified. + release.project_urls = { + "verified_url": {"url": verified_url, "verified": True}, + "unverified_url": {"url": unverified_url, "verified": False}, + } + publisher = GitHubPublisherFactory.create(projects=[project]) + publisher.repository_owner = "foo" + publisher.repository_name = repo_name + claims = {"sha": "somesha"} + identity = PublisherTokenContext(publisher, SignedClaims(claims)) + db_request.oidc_publisher = identity.publisher + db_request.oidc_claims = identity.claims + + db_request.db.add(Classifier(classifier="Environment :: Other Environment")) + db_request.db.add(Classifier(classifier="Programming Language :: Python")) + + filename = "{}-{}.tar.gz".format(project.name, "1.0") + + pyramid_config.testing_securitypolicy(identity=identity) + db_request.user_agent = "warehouse-tests/6.6.6" + db_request.POST = MultiDict( + { + "metadata_version": "1.2", + "name": project.name, + "version": "1.0", + "summary": "This is my summary!", + "filetype": "sdist", + "md5_digest": _TAR_GZ_PKG_MD5, + "content": pretend.stub( + filename=filename, + file=io.BytesIO(_TAR_GZ_PKG_TESTDATA), + type="application/tar", + ), + } + ) + db_request.POST.extend( + [ + ("classifiers", "Environment :: Other Environment"), + ("classifiers", "Programming Language :: Python"), + ("requires_dist", "foo"), + ("requires_dist", "bar (>1.0)"), + ("requires_external", "Cheese (>1.0)"), + ("provides", "testing"), + ] + ) + db_request.POST.add("project_urls", f"verified_url, {verified_url}") + db_request.POST.add("project_urls", f"unverified_url, {unverified_url}") + + storage_service = pretend.stub(store=lambda path, filepath, meta: None) + db_request.find_service = lambda svc, name=None, context=None: { + IFileStorage: storage_service, + IMetricsService: metrics, + }.get(svc) + + legacy.file_upload(db_request) + + # After successful upload, the Release should have now both URLs verified + release_urls = ( + db_request.db.query(ReleaseURL).filter(Release.project == project).all() + ) + release_urls = {r.name: r.verified for r in release_urls} + assert "verified_url" in release_urls and "unverified_url" in release_urls + assert release_urls["verified_url"] + assert release_urls["unverified_url"] + @pytest.mark.parametrize( "version, expected_version", [ @@ -4640,3 +4790,86 @@ def test_missing_trailing_slash_redirect(pyramid_request): "/legacy/ (with a trailing slash)" ) assert resp.headers["Location"] == "/legacy/" + + +@pytest.mark.parametrize( + ("url", "publisher_url", "expected"), + [ + ( # GitHub trivial case + "https://github.com/owner/project", + "https://github.com/owner/project", + True, + ), + ( # ActiveState trivial case + "https://platform.activestate.com/owner/project", + "https://platform.activestate.com/owner/project", + True, + ), + ( # GitLab trivial case + "https://gitlab.com/owner/project", + "https://gitlab.com/owner/project", + True, + ), + ( # URL is a sub-path of the TP URL + "https://github.com/owner/project/issues", + "https://github.com/owner/project", + True, + ), + ( # Normalization + "https://GiThUB.com/owner/project/", + "https://github.com/owner/project", + True, + ), + ( # TP URL is a prefix, but not a parent of the URL + "https://github.com/owner/project22", + "https://github.com/owner/project", + False, + ), + ( # URL is a parent of the TP URL + "https://github.com/owner", + "https://github.com/owner/project", + False, + ), + ( # Scheme component does not match + "http://github.com/owner/project", + "https://github.com/owner/project", + False, + ), + ( # Host component does not match + "https://gitlab.com/owner/project", + "https://github.com/owner/project", + False, + ), + ( # Host component matches, but contains user and port info + "https://user@github.com:443/owner/project", + "https://github.com/owner/project", + False, + ), + ( # URL path component is empty + "https://github.com", + "https://github.com/owner/project", + False, + ), + ( # TP URL path component is empty + # (currently no TPs have an empty path, so even if the given URL is a + # sub-path of the TP URL, we fail the verification) + "https://github.com/owner/project", + "https://github.com", + False, + ), + ( # Both path components are empty + # (currently no TPs have an empty path, so even if the given URL is the + # same as the TP URL, we fail the verification) + "https://github.com", + "https://github.com", + False, + ), + ( # Publisher URL is None + "https://github.com/owner/project", + None, + False, + ), + ], +) +def test_verify_url(url, publisher_url, expected): + assert legacy._verify_url(url, publisher_url) == expected diff --git a/tests/unit/oidc/models/test_activestate.py b/tests/unit/oidc/models/test_activestate.py index dc6d1becde24..7b3ae0cb9e92 100644 --- a/tests/unit/oidc/models/test_activestate.py +++ b/tests/unit/oidc/models/test_activestate.py @@ -84,6 +84,18 @@ def test_publisher_name(self): assert publisher.publisher_name == "ActiveState" + def test_publisher_base_url(self): + org_name = "fakeorg" + project_name = "fakeproject" + publisher = ActiveStatePublisher( + organization=org_name, activestate_project_name=project_name + ) + + assert ( + publisher.publisher_base_url + == f"https://platform.activestate.com/{org_name}/{project_name}" + ) + def test_publisher_url(self): org_name = "fakeorg" project_name = "fakeproject" diff --git a/tests/unit/oidc/models/test_github.py b/tests/unit/oidc/models/test_github.py index 3fc818b1d31b..fa44b85478f0 100644 --- a/tests/unit/oidc/models/test_github.py +++ b/tests/unit/oidc/models/test_github.py @@ -97,6 +97,7 @@ def test_github_publisher_computed_properties(self): assert getattr(publisher, claim_name) is not None assert str(publisher) == "fakeworkflow.yml" + assert publisher.publisher_base_url == "https://github.com/fakeowner/fakerepo" assert publisher.publisher_url() == "https://github.com/fakeowner/fakerepo" assert ( publisher.publisher_url({"sha": "somesha"}) diff --git a/tests/unit/oidc/models/test_gitlab.py b/tests/unit/oidc/models/test_gitlab.py index d1016b017bbb..29efb60f9eba 100644 --- a/tests/unit/oidc/models/test_gitlab.py +++ b/tests/unit/oidc/models/test_gitlab.py @@ -95,6 +95,7 @@ def test_gitlab_publisher_computed_properties(self): assert getattr(publisher, claim_name) is not None assert str(publisher) == "subfolder/fakeworkflow.yml" + assert publisher.publisher_base_url == "https://gitlab.com/fakeowner/fakerepo" assert publisher.publisher_url() == "https://gitlab.com/fakeowner/fakerepo" assert ( publisher.publisher_url({"sha": "somesha"}) diff --git a/tests/unit/oidc/models/test_google.py b/tests/unit/oidc/models/test_google.py index ae4d72e98719..180376c51704 100644 --- a/tests/unit/oidc/models/test_google.py +++ b/tests/unit/oidc/models/test_google.py @@ -33,6 +33,11 @@ def test_publisher_name(self): assert publisher.publisher_name == "Google" + def test_publisher_base_url(self): + publisher = google.GooglePublisher(email="fake@example.com") + + assert publisher.publisher_base_url is None + def test_publisher_url(self): publisher = google.GooglePublisher(email="fake@example.com") diff --git a/warehouse/forklift/legacy.py b/warehouse/forklift/legacy.py index 75c9fa94b151..0ee97af807c1 100644 --- a/warehouse/forklift/legacy.py +++ b/warehouse/forklift/legacy.py @@ -25,6 +25,7 @@ import packaging.utils import packaging.version import packaging_legacy.version +import rfc3986 import sentry_sdk import wtforms import wtforms.validators @@ -456,6 +457,45 @@ def _process_attestations(request, distribution: Distribution): metrics.increment("warehouse.upload.attestations.ok") +def _verify_url(url: str, publisher_url: str | None) -> bool: + """ + Verify a given URL against a Trusted Publisher URL + + A URL is considered "verified" iff it matches the Trusted Publisher URL + such that, when both URLs are normalized: + - The scheme component is the same (e.g: both use `https`) + - The authority component is the same (e.g.: `github.com`) + - The path component is the same, or a sub-path of the Trusted Publisher URL + (e.g.: `org/project` and `org/project/issues.html` will pass verification + against an `org/project` Trusted Publisher path component) + - The path component of the Trusted Publisher URL is not empty + Note: We compare the authority component instead of the host component because + the authority includes the host, and in practice neither URL should have user + nor port information. + """ + if not publisher_url: + return False + + publisher_uri = rfc3986.api.uri_reference(publisher_url).normalize() + user_uri = rfc3986.api.uri_reference(url).normalize() + if publisher_uri.path is None: + # Currently no Trusted Publishers have an empty path component, + # so we defensively fail verification. + return False + elif user_uri.path and publisher_uri.path: + is_subpath = publisher_uri.path == user_uri.path or user_uri.path.startswith( + publisher_uri.path + "/" + ) + else: + is_subpath = publisher_uri.path == user_uri.path + + return ( + publisher_uri.scheme == user_uri.scheme + and publisher_uri.authority == user_uri.authority + and is_subpath + ) + + def _sort_releases(request: Request, project: Project): releases = ( request.db.query(Release) @@ -816,7 +856,23 @@ def file_upload(request): ), ) from None + # Verify any verifiable URLs + publisher_base_url = ( + request.oidc_publisher.publisher_base_url if request.oidc_publisher else None + ) + project_urls = ( + {} + if not meta.project_urls + else { + name: { + "url": url, + "verified": _verify_url(url=url, publisher_url=publisher_base_url), + } + for name, url in meta.project_urls.items() + } + ) try: + is_new_release = False canonical_version = packaging.utils.canonicalize_version(meta.version) release = ( request.db.query(Release) @@ -870,7 +926,7 @@ def file_upload(request): html=rendered or "", rendered_by=readme.renderer_version(), ), - project_urls=meta.project_urls or {}, + project_urls=project_urls, # TODO: Fix this, we currently treat platform as if it is a single # use field, but in reality it is a multi-use field, which the # packaging.metadata library handles correctly. @@ -909,6 +965,7 @@ def file_upload(request): uploaded_via=request.user_agent, ) request.db.add(release) + is_new_release = True # TODO: This should be handled by some sort of database trigger or # a SQLAlchemy hook or the like instead of doing it inline in @@ -1297,6 +1354,19 @@ def file_upload(request): }, ) + # For existing releases, we check if any of the existing project URLs are unverified + # and have been verified in the current upload. In that case, we mark them as + # verified. + if not is_new_release and project_urls: + for name, release_url in release._project_urls.items(): + if ( + not release_url.verified + and name in project_urls + and project_urls[name]["url"] == release_url.url + and project_urls[name]["verified"] + ): + release_url.verified = True + request.db.flush() # flush db now so server default values are populated for celery # Push updates to BigQuery diff --git a/warehouse/migrations/versions/26455e3712a2_create_verified_field_for_releaseurl.py b/warehouse/migrations/versions/26455e3712a2_create_verified_field_for_releaseurl.py new file mode 100644 index 000000000000..ff4623263f6c --- /dev/null +++ b/warehouse/migrations/versions/26455e3712a2_create_verified_field_for_releaseurl.py @@ -0,0 +1,41 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +create verified field for ReleaseUrl + +Revision ID: 26455e3712a2 +Revises: 208d494aac68 +Create Date: 2024-04-30 18:40:17.149050 +""" + +import sqlalchemy as sa + +from alembic import op + +revision = "26455e3712a2" +down_revision = "208d494aac68" + + +def upgrade(): + conn = op.get_bind() + conn.execute(sa.text("SET statement_timeout = 120000")) + conn.execute(sa.text("SET lock_timeout = 120000")) + op.add_column( + "release_urls", + sa.Column( + "verified", sa.Boolean(), server_default=sa.text("false"), nullable=False + ), + ) + + +def downgrade(): + op.drop_column("release_urls", "verified") diff --git a/warehouse/oidc/models/_core.py b/warehouse/oidc/models/_core.py index 9bcb619aec85..588cb4992473 100644 --- a/warehouse/oidc/models/_core.py +++ b/warehouse/oidc/models/_core.py @@ -284,6 +284,11 @@ def publisher_name(self) -> str: # pragma: no cover # Only concrete subclasses are constructed. raise NotImplementedError + @property + def publisher_base_url(self) -> str | None: # pragma: no cover + # Only concrete subclasses are constructed. + raise NotImplementedError + def publisher_url( self, claims: SignedClaims | None = None ) -> str | None: # pragma: no cover diff --git a/warehouse/oidc/models/activestate.py b/warehouse/oidc/models/activestate.py index e004f4fb601f..c005c06b7a98 100644 --- a/warehouse/oidc/models/activestate.py +++ b/warehouse/oidc/models/activestate.py @@ -118,11 +118,15 @@ def publisher_name(self) -> str: def project(self) -> str: return self.activestate_project_name - def publisher_url(self, claims: SignedClaims | None = None) -> str: + @property + def publisher_base_url(self) -> str: return urllib.parse.urljoin( _ACTIVESTATE_URL, f"{self.organization}/{self.activestate_project_name}" ) + def publisher_url(self, claims: SignedClaims | None = None) -> str: + return self.publisher_base_url + def stored_claims(self, claims=None): return {} diff --git a/warehouse/oidc/models/github.py b/warehouse/oidc/models/github.py index f3cc7016733a..a8f761c1a72d 100644 --- a/warehouse/oidc/models/github.py +++ b/warehouse/oidc/models/github.py @@ -239,13 +239,17 @@ def job_workflow_ref(self): def sub(self): return f"repo:{self.repository}" + @property + def publisher_base_url(self): + return f"https://github.com/{self.repository}" + @property def jti(self) -> str: """Placeholder value for JTI.""" return "placeholder" def publisher_url(self, claims=None): - base = f"https://github.com/{self.repository}" + base = self.publisher_base_url sha = claims.get("sha") if claims else None if sha: diff --git a/warehouse/oidc/models/gitlab.py b/warehouse/oidc/models/gitlab.py index 4897b2ed4c45..2ad4428a00be 100644 --- a/warehouse/oidc/models/gitlab.py +++ b/warehouse/oidc/models/gitlab.py @@ -223,13 +223,17 @@ def ci_config_ref_uri(self): def publisher_name(self): return "GitLab" + @property + def publisher_base_url(self): + return f"https://gitlab.com/{self.project_path}" + @property def jti(self) -> str: """Placeholder value for JTI.""" return "placeholder" def publisher_url(self, claims=None): - base = f"https://gitlab.com/{self.project_path}" + base = self.publisher_base_url return f"{base}/commit/{claims['sha']}" if claims else base def stored_claims(self, claims=None): diff --git a/warehouse/oidc/models/google.py b/warehouse/oidc/models/google.py index 9fc14e355bf3..2c4fc14e5d43 100644 --- a/warehouse/oidc/models/google.py +++ b/warehouse/oidc/models/google.py @@ -84,6 +84,10 @@ def __lookup_no_sub__(klass, signed_claims: SignedClaims) -> Query | None: def publisher_name(self): return "Google" + @property + def publisher_base_url(self): + return None + def publisher_url(self, claims=None): return None diff --git a/warehouse/packaging/models.py b/warehouse/packaging/models.py index 29e0e990ebb7..b0226c293cd5 100644 --- a/warehouse/packaging/models.py +++ b/warehouse/packaging/models.py @@ -501,6 +501,7 @@ class ReleaseURL(db.Model): name: Mapped[str] = mapped_column(String(32)) url: Mapped[str] + verified: Mapped[bool] = mapped_column(default=False) DynamicFieldsEnum = ENUM( @@ -610,7 +611,7 @@ def __table_args__(cls): # noqa project_urls = association_proxy( "_project_urls", "url", - creator=lambda k, v: ReleaseURL(name=k, url=v), + creator=lambda k, v: ReleaseURL(name=k, url=v["url"], verified=v["verified"]), ) files: Mapped[list[File]] = orm.relationship(