Skip to content

Commit 3b3c209

Browse files
committed
pip.download: refactor and try to clarify
Add the _check_download_dir and _download_http_url helpers to make the unpack_http_url and unpack_file_url functions easier to understand
1 parent 4f249fa commit 3b3c209

File tree

1 file changed

+94
-102
lines changed

1 file changed

+94
-102
lines changed

pip/download.py

Lines changed: 94 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,8 @@ def resp_read(chunk_size):
513513
finally:
514514
if show_progress:
515515
logger.end_progress('%s downloaded' % format_size(downloaded))
516+
if link.hash and link.hash_name:
517+
_check_hash(download_hash, link)
516518
return download_hash
517519

518520

@@ -547,102 +549,28 @@ def unpack_http_url(link, location, download_dir=None, session=None):
547549
)
548550

549551
temp_dir = tempfile.mkdtemp('-unpack', 'pip-')
550-
from_path = None
551-
target_url = link.url.split('#', 1)[0]
552-
553-
download_hash = None
554552

555553
# If a download dir is specified, is the file already downloaded there?
556-
already_downloaded = False
554+
already_downloaded_path = None
557555
if download_dir:
558-
download_path = os.path.join(download_dir, link.filename)
559-
if os.path.exists(download_path):
560-
# If already downloaded, does its hash match?
561-
content_type = mimetypes.guess_type(download_path)[0]
562-
logger.notify('File was already downloaded %s' % download_path)
563-
if link.hash:
564-
download_hash = _get_hash_from_file(download_path, link)
565-
try:
566-
_check_hash(download_hash, link)
567-
already_downloaded = True
568-
except HashMismatch:
569-
logger.warn(
570-
'Previously-downloaded file %s has bad hash, '
571-
're-downloading.' % download_path
572-
)
573-
os.unlink(download_path)
574-
already_downloaded = False
575-
else:
576-
already_downloaded = True
556+
already_downloaded_path = _check_download_dir(link, download_dir)
577557

578-
if already_downloaded:
579-
from_path = download_path
558+
if already_downloaded_path:
559+
from_path = already_downloaded_path
580560
content_type = mimetypes.guess_type(from_path)[0]
581561
else:
582562
# let's download to a tmp dir
583-
try:
584-
resp = session.get(
585-
target_url,
586-
# We use Accept-Encoding: identity here because requests
587-
# defaults to accepting compressed responses. This breaks in
588-
# a variety of ways depending on how the server is configured.
589-
# - Some servers will notice that the file isn't a compressible
590-
# file and will leave the file alone and with an empty
591-
# Content-Encoding
592-
# - Some servers will notice that the file is already
593-
# compressed and will leave the file alone and will add a
594-
# Content-Encoding: gzip header
595-
# - Some servers won't notice anything at all and will take
596-
# a file that's already been compressed and compress it again
597-
# and set the Content-Encoding: gzip header
598-
# By setting this to request only the identity encoding We're
599-
# hoping to eliminate the third case. Hopefully there does not
600-
# exist a server which when given a file will notice it is
601-
# already compressed and that you're not asking for a
602-
# compressed file and will then decompress it before sending
603-
# because if that's the case I don't think it'll ever be
604-
# possible to make this work.
605-
headers={"Accept-Encoding": "identity"},
606-
stream=True,
607-
)
608-
resp.raise_for_status()
609-
except requests.HTTPError as exc:
610-
logger.fatal("HTTP error %s while getting %s" %
611-
(exc.response.status_code, link))
612-
raise
613-
614-
content_type = resp.headers.get('content-type', '')
615-
filename = link.filename # fallback
616-
# Have a look at the Content-Disposition header for a better guess
617-
content_disposition = resp.headers.get('content-disposition')
618-
if content_disposition:
619-
type, params = cgi.parse_header(content_disposition)
620-
# We use ``or`` here because we don't want to use an "empty" value
621-
# from the filename param.
622-
filename = params.get('filename') or filename
623-
ext = splitext(filename)[1]
624-
if not ext:
625-
ext = mimetypes.guess_extension(content_type)
626-
if ext:
627-
filename += ext
628-
if not ext and link.url != resp.url:
629-
ext = os.path.splitext(resp.url)[1]
630-
if ext:
631-
filename += ext
632-
from_path = os.path.join(temp_dir, filename)
633-
download_hash = _download_url(resp, link, from_path)
634-
if link.hash and link.hash_name:
635-
_check_hash(download_hash, link)
563+
from_path, content_type = _download_http_url(link, session, temp_dir)
636564

637565
# unpack the archive to the build dir location. even when only downloading
638566
# archives, they have to be unpacked to parse dependencies
639567
unpack_file(from_path, location, content_type, link)
640568

641569
# a download dir is specified; let's copy the archive there
642-
if download_dir and not already_downloaded:
570+
if download_dir and not already_downloaded_path:
643571
_copy_file(from_path, download_dir, content_type, link)
644572

645-
if not already_downloaded:
573+
if not already_downloaded_path:
646574
os.unlink(from_path)
647575
os.rmdir(temp_dir)
648576

@@ -653,7 +581,6 @@ def unpack_file_url(link, location, download_dir=None):
653581
of the link file inside download_dir."""
654582

655583
link_path = url_to_path(link.url_without_fragment)
656-
already_downloaded = False
657584

658585
# If it's a url to a local directory
659586
if os.path.isdir(link_path):
@@ -670,27 +597,12 @@ def unpack_file_url(link, location, download_dir=None):
670597
_check_hash(link_path_hash, link)
671598

672599
# If a download dir is specified, is the file already there and valid?
600+
already_downloaded_path = None
673601
if download_dir:
674-
download_path = os.path.join(download_dir, link.filename)
675-
if os.path.exists(download_path):
676-
content_type = mimetypes.guess_type(download_path)[0]
677-
logger.notify('File was already downloaded %s' % download_path)
678-
if link.hash:
679-
download_hash = _get_hash_from_file(download_path, link)
680-
try:
681-
_check_hash(download_hash, link)
682-
already_downloaded = True
683-
except HashMismatch:
684-
logger.warn(
685-
'Previously-downloaded file %s has bad hash, '
686-
're-downloading.' % link_path
687-
)
688-
os.unlink(download_path)
689-
else:
690-
already_downloaded = True
602+
already_downloaded_path = _check_download_dir(link, download_dir)
691603

692-
if already_downloaded:
693-
from_path = download_path
604+
if already_downloaded_path:
605+
from_path = already_downloaded_path
694606
else:
695607
from_path = link_path
696608

@@ -701,7 +613,7 @@ def unpack_file_url(link, location, download_dir=None):
701613
unpack_file(from_path, location, content_type, link)
702614

703615
# a download dir is specified and not already downloaded
704-
if download_dir and not already_downloaded:
616+
if download_dir and not already_downloaded_path:
705617
_copy_file(from_path, download_dir, content_type, link)
706618

707619

@@ -765,3 +677,83 @@ def unpack_url(link, location, download_dir=None,
765677
)
766678
if only_download:
767679
write_delete_marker_file(location)
680+
681+
682+
def _download_http_url(link, session, temp_dir):
    """Download link url into temp_dir using provided session.

    :param link: the link to fetch. ``link.url`` (with any ``#fragment``
        removed) is requested, and ``link.filename`` is the fallback name
        for the saved file.
    :param session: object whose ``get`` method performs the HTTP request.
    :param temp_dir: directory the downloaded file is placed in.
    :returns: a ``(file_path, content_type)`` tuple, where ``content_type``
        is the response's ``content-type`` header (``''`` when absent).
    :raises requests.HTTPError: logged, then re-raised, when the response
        status indicates an error.
    """
    target_url = link.url.split('#', 1)[0]
    try:
        resp = session.get(
            target_url,
            # We use Accept-Encoding: identity here because requests
            # defaults to accepting compressed responses. This breaks in
            # a variety of ways depending on how the server is configured.
            # - Some servers will notice that the file isn't a compressible
            #   file and will leave the file alone and with an empty
            #   Content-Encoding
            # - Some servers will notice that the file is already
            #   compressed and will leave the file alone and will add a
            #   Content-Encoding: gzip header
            # - Some servers won't notice anything at all and will take
            #   a file that's already been compressed and compress it again
            #   and set the Content-Encoding: gzip header
            # By setting this to request only the identity encoding We're
            # hoping to eliminate the third case. Hopefully there does not
            # exist a server which when given a file will notice it is
            # already compressed and that you're not asking for a
            # compressed file and will then decompress it before sending
            # because if that's the case I don't think it'll ever be
            # possible to make this work.
            headers={"Accept-Encoding": "identity"},
            stream=True,
        )
        resp.raise_for_status()
    except requests.HTTPError as exc:
        logger.fatal("HTTP error %s while getting %s" %
                     (exc.response.status_code, link))
        raise

    content_type = resp.headers.get('content-type', '')
    filename = link.filename  # fallback
    # Have a look at the Content-Disposition header for a better guess
    content_disposition = resp.headers.get('content-disposition')
    if content_disposition:
        _, params = cgi.parse_header(content_disposition)
        # The header value is controlled by the server: keep only the final
        # path component so that a name such as "../../x" cannot place the
        # download outside temp_dir.
        # We use ``or`` here because we don't want to use an "empty" value
        # from the filename param.
        suggested = os.path.basename(params.get('filename') or '')
        filename = suggested or filename
    ext = splitext(filename)[1]
    if not ext:
        # No extension in the name: try to derive one from the MIME type.
        ext = mimetypes.guess_extension(content_type)
        if ext:
            filename += ext
    if not ext and link.url != resp.url:
        # Still no extension and the final URL differs from the requested
        # one: borrow the extension of the final URL.
        ext = os.path.splitext(resp.url)[1]
        if ext:
            filename += ext
    file_path = os.path.join(temp_dir, filename)
    # _download_url is expected to write the response body to file_path.
    _download_url(resp, link, file_path)
    return file_path, content_type
737+
738+
739+
def _check_download_dir(link, download_dir):
    """Look in download_dir for an earlier download of link.

    Return the path of the existing file when it is present and, if the
    link carries a hash, the file passes ``_check_hash``. Return None when
    no such file exists, or when the file fails the hash check (in which
    case the file is deleted).
    """
    candidate = os.path.join(download_dir, link.filename)
    if not os.path.exists(candidate):
        return None

    logger.notify('File was already downloaded %s' % candidate)
    if not link.hash:
        # Nothing to verify against; accept the existing file as is.
        return candidate

    file_hash = _get_hash_from_file(candidate, link)
    try:
        _check_hash(file_hash, link)
    except HashMismatch:
        logger.warn(
            'Previously-downloaded file %s has bad hash, '
            're-downloading.' % candidate
        )
        # Remove the bad file so the caller fetches a fresh copy.
        os.unlink(candidate)
        return None
    return candidate

0 commit comments

Comments
 (0)