From be94c82324c18c9e7c12a0c79d83cacac1a2cd58 Mon Sep 17 00:00:00 2001 From: Yeray Diaz Diaz Date: Sun, 18 Mar 2018 18:53:06 +0000 Subject: [PATCH 1/2] Validate if a file with the same blake2 digest already exists (#2490) --- tests/unit/forklift/test_legacy.py | 104 +++++++++++++++++++++++++++++ warehouse/forklift/legacy.py | 17 +++-- 2 files changed, 116 insertions(+), 5 deletions(-) diff --git a/tests/unit/forklift/test_legacy.py b/tests/unit/forklift/test_legacy.py index 3bb8584ebde4..6237f39b3a6c 100644 --- a/tests/unit/forklift/test_legacy.py +++ b/tests/unit/forklift/test_legacy.py @@ -639,10 +639,52 @@ def test_is_duplicate_none(self, pyramid_config, db_request): ), ) + hashes["blake2_256"] = "another blake2 digest" + assert legacy._is_duplicate_file( db_request.db, requested_file_name, hashes ) is None + def test_is_duplicate_false_same_blake2(self, pyramid_config, db_request): + pyramid_config.testing_securitypolicy(userid=1) + + user = UserFactory.create() + EmailFactory.create(user=user) + project = ProjectFactory.create() + release = ReleaseFactory.create(project=project, version="1.0") + RoleFactory.create(user=user, project=project) + + filename = "{}-{}.tar.gz".format(project.name, release.version) + requested_file_name = "{}-{}-1.tar.gz".format(project.name, + release.version) + file_content = io.BytesIO(b"A fake file.") + file_value = file_content.getvalue() + + hashes = { + "sha256": hashlib.sha256(file_value).hexdigest(), + "md5": hashlib.md5(file_value).hexdigest(), + "blake2_256": hashlib.blake2b( + file_value, digest_size=256 // 8 + ).hexdigest() + } + db_request.db.add( + File( + release=release, + filename=filename, + md5_digest=hashes["md5"], + sha256_digest=hashes["sha256"], + blake2_256_digest=hashes["blake2_256"], + path="source/{name[0]}/{name}/{filename}".format( + name=project.name, + filename=filename, + ), + ), + ) + + assert legacy._is_duplicate_file( + db_request.db, requested_file_name, hashes + ) is False + def test_is_duplicate_false(self, pyramid_config, db_request): pyramid_config.testing_securitypolicy(userid=1) @@ -1788,6 +1830,68 @@ def test_upload_fails_with_existing_filename_diff_content(self, "See /the/help/url/" ) + def test_upload_fails_with_diff_filename_same_blake2(self, + pyramid_config, + db_request): + pyramid_config.testing_securitypolicy(userid=1) + + user = UserFactory.create() + project = ProjectFactory.create() + release = ReleaseFactory.create(project=project, version="1.0") + RoleFactory.create(user=user, project=project) + + filename = "{}-{}.tar.gz".format(project.name, release.version) + file_content = io.BytesIO(b"A fake file.") + + db_request.POST = MultiDict({ + "metadata_version": "1.2", + "name": project.name, + "version": release.version, + "filetype": "sdist", + "md5_digest": hashlib.md5(file_content.getvalue()).hexdigest(), + "content": pretend.stub( + filename="{}-fake.tar.gz".format(project.name), + file=file_content, + type="application/tar", + ), + }) + + db_request.db.add( + File( + release=release, + filename=filename, + md5_digest=hashlib.md5(file_content.getvalue()).hexdigest(), + sha256_digest=hashlib.sha256( + file_content.getvalue() + ).hexdigest(), + blake2_256_digest=hashlib.blake2b( + file_content.getvalue(), + digest_size=256 // 8 + ).hexdigest(), + path="source/{name[0]}/{name}/{filename}".format( + name=project.name, + filename=filename, + ), + ), + ) + db_request.route_url = pretend.call_recorder( + lambda route, **kw: "/the/help/url/" + ) + + with pytest.raises(HTTPBadRequest) as excinfo: + legacy.file_upload(db_request) + + resp = excinfo.value + + assert db_request.route_url.calls == [ + pretend.call('help', _anchor='file-name-reuse') + ] + assert resp.status_code == 400 + assert resp.status == ( + "400 File already exists. " + "See /the/help/url/" + ) + def test_upload_fails_with_wrong_filename(self, pyramid_config, db_request): pyramid_config.testing_securitypolicy(userid=1) diff --git a/warehouse/forklift/legacy.py b/warehouse/forklift/legacy.py index 79b3a7cfa942..b3ed1988b868 100644 --- a/warehouse/forklift/legacy.py +++ b/warehouse/forklift/legacy.py @@ -660,7 +660,9 @@ def _is_valid_dist_file(filename, filetype): def _is_duplicate_file(db_session, filename, hashes): """ - Check to see if file already exists, and if it's content matches + Check to see if file already exists, and if it's content matches. + A file is considered to exist if its filename *or* blake2 digest are + present in a file row in the database. Returns: - True: This file is a duplicate and all further processing should halt. @@ -670,14 +672,19 @@ def _is_duplicate_file(db_session, filename, hashes): file_ = ( db_session.query(File) - .filter(File.filename == filename) + .filter( + (File.filename == filename) | + (File.blake2_256_digest == hashes["blake2_256"])) .first() ) if file_ is not None: - return (file_.sha256_digest == hashes["sha256"] and - file_.md5_digest == hashes["md5"] and - file_.blake2_256_digest == hashes["blake2_256"]) + return ( + file_.filename == filename and + file_.sha256_digest == hashes["sha256"] and + file_.md5_digest == hashes["md5"] and + file_.blake2_256_digest == hashes["blake2_256"] + ) return None From c1a4e72cec083a66fffbb479984d8281e7d0ba00 Mon Sep 17 00:00:00 2001 From: Yeray Diaz Diaz Date: Tue, 20 Mar 2018 09:21:31 +0000 Subject: [PATCH 2/2] Update help text for "file or contents exist" --- tests/unit/forklift/test_legacy.py | 4 ++-- warehouse/forklift/legacy.py | 2 +- warehouse/templates/pages/help.html | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/unit/forklift/test_legacy.py b/tests/unit/forklift/test_legacy.py index 6237f39b3a6c..02eea1ece48a 100644 --- a/tests/unit/forklift/test_legacy.py +++ b/tests/unit/forklift/test_legacy.py @@ -1826,7 +1826,7 @@ def test_upload_fails_with_existing_filename_diff_content(self, ] assert resp.status_code == 400 assert resp.status == ( - "400 File already exists. " + "400 The filename or contents already exist. " "See /the/help/url/" ) @@ -1888,7 +1888,7 @@ def test_upload_fails_with_diff_filename_same_blake2(self, ] assert resp.status_code == 400 assert resp.status == ( - "400 File already exists. " + "400 The filename or contents already exist. " "See /the/help/url/" ) diff --git a/warehouse/forklift/legacy.py b/warehouse/forklift/legacy.py index b3ed1988b868..296bfdcde89d 100644 --- a/warehouse/forklift/legacy.py +++ b/warehouse/forklift/legacy.py @@ -1080,7 +1080,7 @@ def file_upload(request): return Response() elif is_duplicate is not None: raise _exc_with_message( - HTTPBadRequest, "File already exists. " + HTTPBadRequest, "The filename or contents already exist. " "See " + request.route_url( 'help', _anchor='file-name-reuse' diff --git a/warehouse/templates/pages/help.html b/warehouse/templates/pages/help.html index 8fea8d00dad3..d53a61aabbe5 100644 --- a/warehouse/templates/pages/help.html +++ b/warehouse/templates/pages/help.html @@ -32,7 +32,7 @@ {% macro private_indices() %}How can I publish my private packages to PyPI?{% endmacro %} {% macro admin_intervention() %}Why did my package or user registration get blocked?{% endmacro %} {% macro file_size_limit() %}How do I get a file size limit exemption or increase for my project?{% endmacro %} -{% macro file_name_reuse() %}Why am I getting a "File already exists" error?{% endmacro %} +{% macro file_name_reuse() %}Why am I getting a "Filename or contents already exists" or "Filename has been previously used" errors?{% endmacro %} {% macro project_name() %}Why isn't my desired project name available?{% endmacro %} {% macro project_name_claim() %}How do I claim an abandoned or previously registered project name?{% endmacro %} {% macro feedback() %}Where can I report a bug or provide feedback?{% endmacro %} @@ -224,11 +224,12 @@

{{ admin_intervention() }}

{{ file_name_reuse() }}

- The error HTTPError: 400 Client Error: File already exists happens for one of two reasons: + PyPI will return these errors for one of these reasons:

PyPI does not allow for a filename to be reused, even once a project has been deleted and recreated.