Skip to content

Validate if a file with the same blake2 digest already exists (#2490) #3310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 20, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 105 additions & 1 deletion tests/unit/forklift/test_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,10 +639,52 @@ def test_is_duplicate_none(self, pyramid_config, db_request):
),
)

hashes["blake2_256"] = "another blake2 digest"

assert legacy._is_duplicate_file(
db_request.db, requested_file_name, hashes
) is None

def test_is_duplicate_false_same_blake2(self, pyramid_config, db_request):
    """
    A stored file whose digests match the requested upload, but whose
    filename differs, must make ``_is_duplicate_file`` return ``False``.
    """
    pyramid_config.testing_securitypolicy(userid=1)

    user = UserFactory.create()
    EmailFactory.create(user=user)
    project = ProjectFactory.create()
    release = ReleaseFactory.create(project=project, version="1.0")
    RoleFactory.create(user=user, project=project)

    # The stored file and the requested file differ only by name.
    stored_name = "{}-{}.tar.gz".format(project.name, release.version)
    requested_name = "{}-{}-1.tar.gz".format(
        project.name, release.version
    )
    payload = io.BytesIO(b"A fake file.").getvalue()

    sha256_hex = hashlib.sha256(payload).hexdigest()
    md5_hex = hashlib.md5(payload).hexdigest()
    blake2_hex = hashlib.blake2b(payload, digest_size=256 // 8).hexdigest()
    hashes = {
        "sha256": sha256_hex,
        "md5": md5_hex,
        "blake2_256": blake2_hex,
    }

    # Persist a file row carrying exactly the digests being looked up.
    stored_file = File(
        release=release,
        filename=stored_name,
        md5_digest=hashes["md5"],
        sha256_digest=hashes["sha256"],
        blake2_256_digest=hashes["blake2_256"],
        path="source/{name[0]}/{name}/{filename}".format(
            name=project.name,
            filename=stored_name,
        ),
    )
    db_request.db.add(stored_file)

    outcome = legacy._is_duplicate_file(
        db_request.db, requested_name, hashes
    )
    assert outcome is False

def test_is_duplicate_false(self, pyramid_config, db_request):
pyramid_config.testing_securitypolicy(userid=1)

Expand Down Expand Up @@ -1792,7 +1834,69 @@ def test_upload_fails_with_existing_filename_diff_content(self,
]
assert resp.status_code == 400
assert resp.status == (
"400 File already exists. "
"400 The filename or contents already exist. "
"See /the/help/url/"
)

def test_upload_fails_with_diff_filename_same_blake2(
        self, pyramid_config, db_request):
    """
    Uploading content that is already stored under another filename must
    raise ``HTTPBadRequest`` with the "filename or contents already exist"
    status and a link to the ``file-name-reuse`` help anchor.
    """
    pyramid_config.testing_securitypolicy(userid=1)

    user = UserFactory.create()
    project = ProjectFactory.create()
    release = ReleaseFactory.create(project=project, version="1.0")
    RoleFactory.create(user=user, project=project)

    stored_name = "{}-{}.tar.gz".format(project.name, release.version)
    upload = io.BytesIO(b"A fake file.")
    # getvalue() does not move the stream position, so reading the bytes
    # once up front leaves ``upload`` untouched for the view under test.
    payload = upload.getvalue()
    md5_hex = hashlib.md5(payload).hexdigest()

    # The upload uses a new filename but the same bytes as the stored file.
    form_fields = {
        "metadata_version": "1.2",
        "name": project.name,
        "version": release.version,
        "filetype": "sdist",
        "md5_digest": md5_hex,
        "content": pretend.stub(
            filename="{}-fake.tar.gz".format(project.name),
            file=upload,
            type="application/tar",
        ),
    }
    db_request.POST = MultiDict(form_fields)

    stored_file = File(
        release=release,
        filename=stored_name,
        md5_digest=md5_hex,
        sha256_digest=hashlib.sha256(payload).hexdigest(),
        blake2_256_digest=hashlib.blake2b(
            payload, digest_size=256 // 8
        ).hexdigest(),
        path="source/{name[0]}/{name}/{filename}".format(
            name=project.name,
            filename=stored_name,
        ),
    )
    db_request.db.add(stored_file)

    db_request.route_url = pretend.call_recorder(
        lambda route, **kw: "/the/help/url/"
    )

    with pytest.raises(HTTPBadRequest) as excinfo:
        legacy.file_upload(db_request)

    error = excinfo.value

    expected_calls = [pretend.call('help', _anchor='file-name-reuse')]
    assert db_request.route_url.calls == expected_calls
    assert error.status_code == 400
    assert error.status == (
        "400 The filename or contents already exist. "
        "See /the/help/url/"
    )

Expand Down
19 changes: 13 additions & 6 deletions warehouse/forklift/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,7 +660,9 @@ def _is_valid_dist_file(filename, filetype):

def _is_duplicate_file(db_session, filename, hashes):
"""
Check to see if file already exists, and if it's content matches
Check to see if file already exists, and if its content matches.
A file is considered to exist if its filename *or* blake2 digest is
present in a file row in the database.

Returns:
- True: This file is a duplicate and all further processing should halt.
Expand All @@ -670,14 +672,19 @@ def _is_duplicate_file(db_session, filename, hashes):

file_ = (
db_session.query(File)
.filter(File.filename == filename)
.filter(
(File.filename == filename) |
(File.blake2_256_digest == hashes["blake2_256"]))
.first()
)

if file_ is not None:
return (file_.sha256_digest == hashes["sha256"] and
file_.md5_digest == hashes["md5"] and
file_.blake2_256_digest == hashes["blake2_256"])
return (
file_.filename == filename and
file_.sha256_digest == hashes["sha256"] and
file_.md5_digest == hashes["md5"] and
file_.blake2_256_digest == hashes["blake2_256"]
)

return None

Expand Down Expand Up @@ -1075,7 +1082,7 @@ def file_upload(request):
return Response()
elif is_duplicate is not None:
raise _exc_with_message(
HTTPBadRequest, "File already exists. "
HTTPBadRequest, "The filename or contents already exist. "
"See " +
request.route_url(
'help', _anchor='file-name-reuse'
Expand Down
5 changes: 3 additions & 2 deletions warehouse/templates/pages/help.html
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
{% macro private_indices() %}How can I publish my private packages to PyPI?{% endmacro %}
{% macro admin_intervention() %}Why did my package or user registration get blocked?{% endmacro %}
{% macro file_size_limit() %}How do I get a file size limit exemption or increase for my project?{% endmacro %}
{% macro file_name_reuse() %}Why am I getting a "File already exists" error?{% endmacro %}
{% macro file_name_reuse() %}Why am I getting a "Filename or contents already exists" or "Filename has been previously used" error?{% endmacro %}
{% macro project_name() %}Why isn't my desired project name available?{% endmacro %}
{% macro project_name_claim() %}How do I claim an abandoned or previously registered project name?{% endmacro %}
{% macro feedback() %}Where can I report a bug or provide feedback?{% endmacro %}
Expand Down Expand Up @@ -224,11 +224,12 @@ <h3 id="admin-intervention">{{ admin_intervention() }}</h3>

<h3 id="file-name-reuse">{{ file_name_reuse() }}</h3>
<p>
The error <i>HTTPError: 400 Client Error: File already exists</i> happens for one of two reasons:
PyPI will return these errors for one of these reasons:
</p>
<ul>
<li>Filename has been used and file exists</li>
<li>Filename has been used but file no longer exists</li>
<li>A file with the exact same content exists</li>
</ul>
<p>
PyPI does not allow for a filename to be reused, even once a project has been deleted and recreated.
Expand Down