From 797119d9e3fdfc94f2f774fda54781ddcff7f8ec Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Thu, 19 Jan 2023 23:04:30 -0800 Subject: [PATCH 1/2] [3.11] gh-101144: Allow open and read_text encoding to be positional. (GH-101145) The zipfile.Path open() and read_text() encoding parameter can be supplied as a positional argument without causing a TypeError again. 3.10.0b1 included a regression that made it keyword only. Documentation update included as users writing code to be compatible with a wide range of versions will need to consider this for some time.. (cherry picked from commit 5927013e47a8c63b70e104152351f3447baa819c) Co-authored-by: Gregory P. Smith --- Doc/library/zipfile.rst | 12 + Lib/test/test_zipfile.py | 66 ++- Lib/test/test_zipfile/test_path.py | 495 ++++++++++++++++++ Lib/zipfile.py | 15 +- Lib/zipfile/_path.py | 338 ++++++++++++ ...-01-18-17-58-50.gh-issue-101144.FHd8Un.rst | 4 + 6 files changed, 924 insertions(+), 6 deletions(-) create mode 100644 Lib/test/test_zipfile/test_path.py create mode 100644 Lib/zipfile/_path.py create mode 100644 Misc/NEWS.d/next/Library/2023-01-18-17-58-50.gh-issue-101144.FHd8Un.rst diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index d02e32d320bc85..af1f26385553ab 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -551,6 +551,12 @@ Path objects are traversable using the ``/`` operator or ``joinpath``. Added support for text and binary modes for open. Default mode is now text. + .. versionchanged:: 3.11.2 + The ``encoding`` parameter can be supplied as a positional argument + without causing a :exc:`TypeError`. As it could in 3.9. Code needing to + be compatible with unpatched 3.10 and 3.11 versions must pass all + :class:`io.TextIOWrapper` arguments, ``encoding`` included, as keywords. + .. method:: Path.iterdir() Enumerate the children of the current directory. @@ -596,6 +602,12 @@ Path objects are traversable using the ``/`` operator or ``joinpath``. :class:`io.TextIOWrapper` (except ``buffer``, which is implied by the context). + .. versionchanged:: 3.11.2 + The ``encoding`` parameter can be supplied as a positional argument + without causing a :exc:`TypeError`. As it could in 3.9. Code needing to + be compatible with unpatched 3.10 and 3.11 versions must pass all + :class:`io.TextIOWrapper` arguments, ``encoding`` included, as keywords. + .. method:: Path.read_bytes() Read the current file as bytes. diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index fd25e5a800d74d..35d78d96a68e69 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -10,6 +10,7 @@ import struct import subprocess import sys +from test.support.script_helper import assert_python_ok import time import unittest import unittest.mock as mock @@ -3005,7 +3006,69 @@ def test_open(self, alpharep): a, b, g = root.iterdir() with a.open(encoding="utf-8") as strm: data = strm.read() - assert data == "content of a" + self.assertEqual(data, "content of a") + with a.open('r', "utf-8") as strm: # not a kw, no gh-101144 TypeError + data = strm.read() + self.assertEqual(data, "content of a") + + def test_open_encoding_utf16(self): + in_memory_file = io.BytesIO() + zf = zipfile.ZipFile(in_memory_file, "w") + zf.writestr("path/16.txt", "This was utf-16".encode("utf-16")) + zf.filename = "test_open_utf16.zip" + root = zipfile.Path(zf) + (path,) = root.iterdir() + u16 = path.joinpath("16.txt") + with u16.open('r', "utf-16") as strm: + data = strm.read() + self.assertEqual(data, "This was utf-16") + with u16.open(encoding="utf-16") as strm: + data = strm.read() + self.assertEqual(data, "This was utf-16") + + def test_open_encoding_errors(self): + in_memory_file = io.BytesIO() + zf = zipfile.ZipFile(in_memory_file, "w") + zf.writestr("path/bad-utf8.bin", b"invalid utf-8: \xff\xff.") + zf.filename = "test_read_text_encoding_errors.zip" + root = zipfile.Path(zf) + (path,) = root.iterdir() + u16 = path.joinpath("bad-utf8.bin") + + # encoding= as a positional argument for gh-101144. + data = u16.read_text("utf-8", errors="ignore") + self.assertEqual(data, "invalid utf-8: .") + with u16.open("r", "utf-8", errors="surrogateescape") as f: + self.assertEqual(f.read(), "invalid utf-8: \udcff\udcff.") + + # encoding= both positional and keyword is an error; gh-101144. + with self.assertRaisesRegex(TypeError, "encoding"): + data = u16.read_text("utf-8", encoding="utf-8") + + # both keyword arguments work. + with u16.open("r", encoding="utf-8", errors="strict") as f: + # error during decoding with wrong codec. + with self.assertRaises(UnicodeDecodeError): + f.read() + + def test_encoding_warnings(self): + """EncodingWarning must blame the read_text and open calls.""" + code = '''\ +import io, zipfile +with zipfile.ZipFile(io.BytesIO(), "w") as zf: + zf.filename = '' + zf.writestr("path/file.txt", b"Spanish Inquisition") + root = zipfile.Path(zf) + (path,) = root.iterdir() + file_path = path.joinpath("file.txt") + unused = file_path.read_text() # should warn + file_path.open("r").close() # should warn +''' + proc = assert_python_ok('-X', 'warn_default_encoding', '-c', code) + warnings = proc.err.splitlines() + self.assertEqual(len(warnings), 2, proc.err) + self.assertRegex(warnings[0], rb"^:8: EncodingWarning:") + self.assertRegex(warnings[1], rb"^:9: EncodingWarning:") def test_open_write(self): """ @@ -3047,6 +3110,7 @@ def test_read(self, alpharep): root = zipfile.Path(alpharep) a, b, g = root.iterdir() assert a.read_text(encoding="utf-8") == "content of a" + a.read_text("utf-8") # No positional arg TypeError per gh-101144. assert a.read_bytes() == b"content of a" @pass_alpharep diff --git a/Lib/test/test_zipfile/test_path.py b/Lib/test/test_zipfile/test_path.py new file mode 100644 index 00000000000000..3086fd2080a97d --- /dev/null +++ b/Lib/test/test_zipfile/test_path.py @@ -0,0 +1,495 @@ +import io +import itertools +import contextlib +import pathlib +import pickle +import string +from test.support.script_helper import assert_python_ok +import unittest +import zipfile + +from ._test_params import parameterize, Invoked +from ._functools import compose + + +from test.support.os_helper import temp_dir + + +# Poor man's technique to consume a (smallish) iterable. +consume = tuple + + +# from jaraco.itertools 5.0 +class jaraco: + class itertools: + class Counter: + def __init__(self, i): + self.count = 0 + self._orig_iter = iter(i) + + def __iter__(self): + return self + + def __next__(self): + result = next(self._orig_iter) + self.count += 1 + return result + + +def add_dirs(zf): + """ + Given a writable zip file zf, inject directory entries for + any directories implied by the presence of children. + """ + for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()): + zf.writestr(name, b"") + return zf + + +def build_alpharep_fixture(): + """ + Create a zip file with this structure: + + . + ├── a.txt + ├── b + │ ├── c.txt + │ ├── d + │ │ └── e.txt + │ └── f.txt + └── g + └── h + └── i.txt + + This fixture has the following key characteristics: + + - a file at the root (a) + - a file two levels deep (b/d/e) + - multiple files in a directory (b/c, b/f) + - a directory containing only a directory (g/h) + + "alpha" because it uses alphabet + "rep" because it's a representative example + """ + data = io.BytesIO() + zf = zipfile.ZipFile(data, "w") + zf.writestr("a.txt", b"content of a") + zf.writestr("b/c.txt", b"content of c") + zf.writestr("b/d/e.txt", b"content of e") + zf.writestr("b/f.txt", b"content of f") + zf.writestr("g/h/i.txt", b"content of i") + zf.filename = "alpharep.zip" + return zf + + +alpharep_generators = [ + Invoked.wrap(build_alpharep_fixture), + Invoked.wrap(compose(add_dirs, build_alpharep_fixture)), +] + +pass_alpharep = parameterize(['alpharep'], alpharep_generators) + + +class TestPath(unittest.TestCase): + def setUp(self): + self.fixtures = contextlib.ExitStack() + self.addCleanup(self.fixtures.close) + + def zipfile_ondisk(self, alpharep): + tmpdir = pathlib.Path(self.fixtures.enter_context(temp_dir())) + buffer = alpharep.fp + alpharep.close() + path = tmpdir / alpharep.filename + with path.open("wb") as strm: + strm.write(buffer.getvalue()) + return path + + @pass_alpharep + def test_iterdir_and_types(self, alpharep): + root = zipfile.Path(alpharep) + assert root.is_dir() + a, b, g = root.iterdir() + assert a.is_file() + assert b.is_dir() + assert g.is_dir() + c, f, d = b.iterdir() + assert c.is_file() and f.is_file() + (e,) = d.iterdir() + assert e.is_file() + (h,) = g.iterdir() + (i,) = h.iterdir() + assert i.is_file() + + @pass_alpharep + def test_is_file_missing(self, alpharep): + root = zipfile.Path(alpharep) + assert not root.joinpath('missing.txt').is_file() + + @pass_alpharep + def test_iterdir_on_file(self, alpharep): + root = zipfile.Path(alpharep) + a, b, g = root.iterdir() + with self.assertRaises(ValueError): + a.iterdir() + + @pass_alpharep + def test_subdir_is_dir(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'b').is_dir() + assert (root / 'b/').is_dir() + assert (root / 'g').is_dir() + assert (root / 'g/').is_dir() + + @pass_alpharep + def test_open(self, alpharep): + root = zipfile.Path(alpharep) + a, b, g = root.iterdir() + with a.open(encoding="utf-8") as strm: + data = strm.read() + self.assertEqual(data, "content of a") + with a.open('r', "utf-8") as strm: # not a kw, no gh-101144 TypeError + data = strm.read() + self.assertEqual(data, "content of a") + + def test_open_encoding_utf16(self): + in_memory_file = io.BytesIO() + zf = zipfile.ZipFile(in_memory_file, "w") + zf.writestr("path/16.txt", "This was utf-16".encode("utf-16")) + zf.filename = "test_open_utf16.zip" + root = zipfile.Path(zf) + (path,) = root.iterdir() + u16 = path.joinpath("16.txt") + with u16.open('r', "utf-16") as strm: + data = strm.read() + self.assertEqual(data, "This was utf-16") + with u16.open(encoding="utf-16") as strm: + data = strm.read() + self.assertEqual(data, "This was utf-16") + + def test_open_encoding_errors(self): + in_memory_file = io.BytesIO() + zf = zipfile.ZipFile(in_memory_file, "w") + zf.writestr("path/bad-utf8.bin", b"invalid utf-8: \xff\xff.") + zf.filename = "test_read_text_encoding_errors.zip" + root = zipfile.Path(zf) + (path,) = root.iterdir() + u16 = path.joinpath("bad-utf8.bin") + + # encoding= as a positional argument for gh-101144. + data = u16.read_text("utf-8", errors="ignore") + self.assertEqual(data, "invalid utf-8: .") + with u16.open("r", "utf-8", errors="surrogateescape") as f: + self.assertEqual(f.read(), "invalid utf-8: \udcff\udcff.") + + # encoding= both positional and keyword is an error; gh-101144. + with self.assertRaisesRegex(TypeError, "encoding"): + data = u16.read_text("utf-8", encoding="utf-8") + + # both keyword arguments work. + with u16.open("r", encoding="utf-8", errors="strict") as f: + # error during decoding with wrong codec. + with self.assertRaises(UnicodeDecodeError): + f.read() + + def test_encoding_warnings(self): + """EncodingWarning must blame the read_text and open calls.""" + code = '''\ +import io, zipfile +with zipfile.ZipFile(io.BytesIO(), "w") as zf: + zf.filename = '' + zf.writestr("path/file.txt", b"Spanish Inquisition") + root = zipfile.Path(zf) + (path,) = root.iterdir() + file_path = path.joinpath("file.txt") + unused = file_path.read_text() # should warn + file_path.open("r").close() # should warn +''' + proc = assert_python_ok('-X', 'warn_default_encoding', '-c', code) + warnings = proc.err.splitlines() + self.assertEqual(len(warnings), 2, proc.err) + self.assertRegex(warnings[0], rb"^:8: EncodingWarning:") + self.assertRegex(warnings[1], rb"^:9: EncodingWarning:") + + def test_open_write(self): + """ + If the zipfile is open for write, it should be possible to + write bytes or text to it. + """ + zf = zipfile.Path(zipfile.ZipFile(io.BytesIO(), mode='w')) + with zf.joinpath('file.bin').open('wb') as strm: + strm.write(b'binary contents') + with zf.joinpath('file.txt').open('w', encoding="utf-8") as strm: + strm.write('text file') + + def test_open_extant_directory(self): + """ + Attempting to open a directory raises IsADirectoryError. + """ + zf = zipfile.Path(add_dirs(build_alpharep_fixture())) + with self.assertRaises(IsADirectoryError): + zf.joinpath('b').open() + + @pass_alpharep + def test_open_binary_invalid_args(self, alpharep): + root = zipfile.Path(alpharep) + with self.assertRaises(ValueError): + root.joinpath('a.txt').open('rb', encoding='utf-8') + with self.assertRaises(ValueError): + root.joinpath('a.txt').open('rb', 'utf-8') + + def test_open_missing_directory(self): + """ + Attempting to open a missing directory raises FileNotFoundError. + """ + zf = zipfile.Path(add_dirs(build_alpharep_fixture())) + with self.assertRaises(FileNotFoundError): + zf.joinpath('z').open() + + @pass_alpharep + def test_read(self, alpharep): + root = zipfile.Path(alpharep) + a, b, g = root.iterdir() + assert a.read_text(encoding="utf-8") == "content of a" + a.read_text("utf-8") # No positional arg TypeError per gh-101144. + assert a.read_bytes() == b"content of a" + + @pass_alpharep + def test_joinpath(self, alpharep): + root = zipfile.Path(alpharep) + a = root.joinpath("a.txt") + assert a.is_file() + e = root.joinpath("b").joinpath("d").joinpath("e.txt") + assert e.read_text(encoding="utf-8") == "content of e" + + @pass_alpharep + def test_joinpath_multiple(self, alpharep): + root = zipfile.Path(alpharep) + e = root.joinpath("b", "d", "e.txt") + assert e.read_text(encoding="utf-8") == "content of e" + + @pass_alpharep + def test_traverse_truediv(self, alpharep): + root = zipfile.Path(alpharep) + a = root / "a.txt" + assert a.is_file() + e = root / "b" / "d" / "e.txt" + assert e.read_text(encoding="utf-8") == "content of e" + + @pass_alpharep + def test_traverse_simplediv(self, alpharep): + """ + Disable the __future__.division when testing traversal. + """ + code = compile( + source="zipfile.Path(alpharep) / 'a'", + filename="(test)", + mode="eval", + dont_inherit=True, + ) + eval(code) + + @pass_alpharep + def test_pathlike_construction(self, alpharep): + """ + zipfile.Path should be constructable from a path-like object + """ + zipfile_ondisk = self.zipfile_ondisk(alpharep) + pathlike = pathlib.Path(str(zipfile_ondisk)) + zipfile.Path(pathlike) + + @pass_alpharep + def test_traverse_pathlike(self, alpharep): + root = zipfile.Path(alpharep) + root / pathlib.Path("a") + + @pass_alpharep + def test_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'a').parent.at == '' + assert (root / 'a' / 'b').parent.at == 'a/' + + @pass_alpharep + def test_dir_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'b').parent.at == '' + assert (root / 'b/').parent.at == '' + + @pass_alpharep + def test_missing_dir_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert (root / 'missing dir/').parent.at == '' + + @pass_alpharep + def test_mutability(self, alpharep): + """ + If the underlying zipfile is changed, the Path object should + reflect that change. + """ + root = zipfile.Path(alpharep) + a, b, g = root.iterdir() + alpharep.writestr('foo.txt', 'foo') + alpharep.writestr('bar/baz.txt', 'baz') + assert any(child.name == 'foo.txt' for child in root.iterdir()) + assert (root / 'foo.txt').read_text(encoding="utf-8") == 'foo' + (baz,) = (root / 'bar').iterdir() + assert baz.read_text(encoding="utf-8") == 'baz' + + HUGE_ZIPFILE_NUM_ENTRIES = 2**13 + + def huge_zipfile(self): + """Create a read-only zipfile with a huge number of entries entries.""" + strm = io.BytesIO() + zf = zipfile.ZipFile(strm, "w") + for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): + zf.writestr(entry, entry) + zf.mode = 'r' + return zf + + def test_joinpath_constant_time(self): + """ + Ensure joinpath on items in zipfile is linear time. + """ + root = zipfile.Path(self.huge_zipfile()) + entries = jaraco.itertools.Counter(root.iterdir()) + for entry in entries: + entry.joinpath('suffix') + # Check the file iterated all items + assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES + + # @func_timeout.func_set_timeout(3) + def test_implied_dirs_performance(self): + data = ['/'.join(string.ascii_lowercase + str(n)) for n in range(10000)] + zipfile.CompleteDirs._implied_dirs(data) + + @pass_alpharep + def test_read_does_not_close(self, alpharep): + alpharep = self.zipfile_ondisk(alpharep) + with zipfile.ZipFile(alpharep) as file: + for rep in range(2): + zipfile.Path(file, 'a.txt').read_text(encoding="utf-8") + + @pass_alpharep + def test_subclass(self, alpharep): + class Subclass(zipfile.Path): + pass + + root = Subclass(alpharep) + assert isinstance(root / 'b', Subclass) + + @pass_alpharep + def test_filename(self, alpharep): + root = zipfile.Path(alpharep) + assert root.filename == pathlib.Path('alpharep.zip') + + @pass_alpharep + def test_root_name(self, alpharep): + """ + The name of the root should be the name of the zipfile + """ + root = zipfile.Path(alpharep) + assert root.name == 'alpharep.zip' == root.filename.name + + @pass_alpharep + def test_suffix(self, alpharep): + """ + The suffix of the root should be the suffix of the zipfile. + The suffix of each nested file is the final component's last suffix, if any. + Includes the leading period, just like pathlib.Path. + """ + root = zipfile.Path(alpharep) + assert root.suffix == '.zip' == root.filename.suffix + + b = root / "b.txt" + assert b.suffix == ".txt" + + c = root / "c" / "filename.tar.gz" + assert c.suffix == ".gz" + + d = root / "d" + assert d.suffix == "" + + @pass_alpharep + def test_suffixes(self, alpharep): + """ + The suffix of the root should be the suffix of the zipfile. + The suffix of each nested file is the final component's last suffix, if any. + Includes the leading period, just like pathlib.Path. + """ + root = zipfile.Path(alpharep) + assert root.suffixes == ['.zip'] == root.filename.suffixes + + b = root / 'b.txt' + assert b.suffixes == ['.txt'] + + c = root / 'c' / 'filename.tar.gz' + assert c.suffixes == ['.tar', '.gz'] + + d = root / 'd' + assert d.suffixes == [] + + e = root / '.hgrc' + assert e.suffixes == [] + + @pass_alpharep + def test_stem(self, alpharep): + """ + The final path component, without its suffix + """ + root = zipfile.Path(alpharep) + assert root.stem == 'alpharep' == root.filename.stem + + b = root / "b.txt" + assert b.stem == "b" + + c = root / "c" / "filename.tar.gz" + assert c.stem == "filename.tar" + + d = root / "d" + assert d.stem == "d" + + @pass_alpharep + def test_root_parent(self, alpharep): + root = zipfile.Path(alpharep) + assert root.parent == pathlib.Path('.') + root.root.filename = 'foo/bar.zip' + assert root.parent == pathlib.Path('foo') + + @pass_alpharep + def test_root_unnamed(self, alpharep): + """ + It is an error to attempt to get the name + or parent of an unnamed zipfile. + """ + alpharep.filename = None + root = zipfile.Path(alpharep) + with self.assertRaises(TypeError): + root.name + with self.assertRaises(TypeError): + root.parent + + # .name and .parent should still work on subs + sub = root / "b" + assert sub.name == "b" + assert sub.parent + + @pass_alpharep + def test_inheritance(self, alpharep): + cls = type('PathChild', (zipfile.Path,), {}) + file = cls(alpharep).joinpath('some dir').parent + assert isinstance(file, cls) + + @parameterize( + ['alpharep', 'path_type', 'subpath'], + itertools.product( + alpharep_generators, + [str, pathlib.Path], + ['', 'b/'], + ), + ) + def test_pickle(self, alpharep, path_type, subpath): + zipfile_ondisk = path_type(self.zipfile_ondisk(alpharep)) + + saved_1 = pickle.dumps(zipfile.Path(zipfile_ondisk, at=subpath)) + restored_1 = pickle.loads(saved_1) + first, *rest = restored_1.iterdir() + assert first.read_text().startswith('content of ') diff --git a/Lib/zipfile.py b/Lib/zipfile.py index de667a7efeebc9..f69a43b764ef38 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -2287,6 +2287,11 @@ def _name_set(self): return self.__lookup +def _extract_text_encoding(encoding=None, *args, **kwargs): + # stacklevel=3 so that the caller of the caller see any warning. + return io.text_encoding(encoding, 3), args, kwargs + + class Path: """ A pathlib-compatible interface for zip files. @@ -2396,9 +2401,9 @@ def open(self, mode='r', *args, pwd=None, **kwargs): if args or kwargs: raise ValueError("encoding args invalid for binary operation") return stream - else: - kwargs["encoding"] = io.text_encoding(kwargs.get("encoding")) - return io.TextIOWrapper(stream, *args, **kwargs) + # Text mode: + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + return io.TextIOWrapper(stream, encoding, *args, **kwargs) @property def name(self): @@ -2421,8 +2426,8 @@ def filename(self): return pathlib.Path(self.root.filename).joinpath(self.at) def read_text(self, *args, **kwargs): - kwargs["encoding"] = io.text_encoding(kwargs.get("encoding")) - with self.open('r', *args, **kwargs) as strm: + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + with self.open('r', encoding, *args, **kwargs) as strm: return strm.read() def read_bytes(self): diff --git a/Lib/zipfile/_path.py b/Lib/zipfile/_path.py new file mode 100644 index 00000000000000..7c7a6a0e2c0d32 --- /dev/null +++ b/Lib/zipfile/_path.py @@ -0,0 +1,338 @@ +import io +import posixpath +import zipfile +import itertools +import contextlib +import pathlib + + +__all__ = ['Path'] + + +def _parents(path): + """ + Given a path with elements separated by + posixpath.sep, generate all parents of that path. + + >>> list(_parents('b/d')) + ['b'] + >>> list(_parents('/b/d/')) + ['/b'] + >>> list(_parents('b/d/f/')) + ['b/d', 'b'] + >>> list(_parents('b')) + [] + >>> list(_parents('')) + [] + """ + return itertools.islice(_ancestry(path), 1, None) + + +def _ancestry(path): + """ + Given a path with elements separated by + posixpath.sep, generate all elements of that path + + >>> list(_ancestry('b/d')) + ['b/d', 'b'] + >>> list(_ancestry('/b/d/')) + ['/b/d', '/b'] + >>> list(_ancestry('b/d/f/')) + ['b/d/f', 'b/d', 'b'] + >>> list(_ancestry('b')) + ['b'] + >>> list(_ancestry('')) + [] + """ + path = path.rstrip(posixpath.sep) + while path and path != posixpath.sep: + yield path + path, tail = posixpath.split(path) + + +_dedupe = dict.fromkeys +"""Deduplicate an iterable in original order""" + + +def _difference(minuend, subtrahend): + """ + Return items in minuend not in subtrahend, retaining order + with O(1) lookup. + """ + return itertools.filterfalse(set(subtrahend).__contains__, minuend) + + +class InitializedState: + """ + Mix-in to save the initialization state for pickling. + """ + + def __init__(self, *args, **kwargs): + self.__args = args + self.__kwargs = kwargs + super().__init__(*args, **kwargs) + + def __getstate__(self): + return self.__args, self.__kwargs + + def __setstate__(self, state): + args, kwargs = state + super().__init__(*args, **kwargs) + + +class CompleteDirs(InitializedState, zipfile.ZipFile): + """ + A ZipFile subclass that ensures that implied directories + are always included in the namelist. + """ + + @staticmethod + def _implied_dirs(names): + parents = itertools.chain.from_iterable(map(_parents, names)) + as_dirs = (p + posixpath.sep for p in parents) + return _dedupe(_difference(as_dirs, names)) + + def namelist(self): + names = super(CompleteDirs, self).namelist() + return names + list(self._implied_dirs(names)) + + def _name_set(self): + return set(self.namelist()) + + def resolve_dir(self, name): + """ + If the name represents a directory, return that name + as a directory (with the trailing slash). + """ + names = self._name_set() + dirname = name + '/' + dir_match = name not in names and dirname in names + return dirname if dir_match else name + + @classmethod + def make(cls, source): + """ + Given a source (filename or zipfile), return an + appropriate CompleteDirs subclass. + """ + if isinstance(source, CompleteDirs): + return source + + if not isinstance(source, zipfile.ZipFile): + return cls(source) + + # Only allow for FastLookup when supplied zipfile is read-only + if 'r' not in source.mode: + cls = CompleteDirs + + source.__class__ = cls + return source + + +class FastLookup(CompleteDirs): + """ + ZipFile subclass to ensure implicit + dirs exist and are resolved rapidly. + """ + + def namelist(self): + with contextlib.suppress(AttributeError): + return self.__names + self.__names = super(FastLookup, self).namelist() + return self.__names + + def _name_set(self): + with contextlib.suppress(AttributeError): + return self.__lookup + self.__lookup = super(FastLookup, self)._name_set() + return self.__lookup + + +def _extract_text_encoding(encoding=None, *args, **kwargs): + # stacklevel=3 so that the caller of the caller see any warning. + return io.text_encoding(encoding, 3), args, kwargs + + +class Path: + """ + A pathlib-compatible interface for zip files. + + Consider a zip file with this structure:: + + . + ├── a.txt + └── b + ├── c.txt + └── d + └── e.txt + + >>> data = io.BytesIO() + >>> zf = ZipFile(data, 'w') + >>> zf.writestr('a.txt', 'content of a') + >>> zf.writestr('b/c.txt', 'content of c') + >>> zf.writestr('b/d/e.txt', 'content of e') + >>> zf.filename = 'mem/abcde.zip' + + Path accepts the zipfile object itself or a filename + + >>> root = Path(zf) + + From there, several path operations are available. + + Directory iteration (including the zip file itself): + + >>> a, b = root.iterdir() + >>> a + Path('mem/abcde.zip', 'a.txt') + >>> b + Path('mem/abcde.zip', 'b/') + + name property: + + >>> b.name + 'b' + + join with divide operator: + + >>> c = b / 'c.txt' + >>> c + Path('mem/abcde.zip', 'b/c.txt') + >>> c.name + 'c.txt' + + Read text: + + >>> c.read_text() + 'content of c' + + existence: + + >>> c.exists() + True + >>> (b / 'missing.txt').exists() + False + + Coercion to string: + + >>> import os + >>> str(c).replace(os.sep, posixpath.sep) + 'mem/abcde.zip/b/c.txt' + + At the root, ``name``, ``filename``, and ``parent`` + resolve to the zipfile. Note these attributes are not + valid and will raise a ``ValueError`` if the zipfile + has no filename. + + >>> root.name + 'abcde.zip' + >>> str(root.filename).replace(os.sep, posixpath.sep) + 'mem/abcde.zip' + >>> str(root.parent) + 'mem' + """ + + __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" + + def __init__(self, root, at=""): + """ + Construct a Path from a ZipFile or filename. + + Note: When the source is an existing ZipFile object, + its type (__class__) will be mutated to a + specialized type. If the caller wishes to retain the + original type, the caller should either create a + separate ZipFile object or pass a filename. + """ + self.root = FastLookup.make(root) + self.at = at + + def open(self, mode='r', *args, pwd=None, **kwargs): + """ + Open this entry as text or binary following the semantics + of ``pathlib.Path.open()`` by passing arguments through + to io.TextIOWrapper(). + """ + if self.is_dir(): + raise IsADirectoryError(self) + zip_mode = mode[0] + if not self.exists() and zip_mode == 'r': + raise FileNotFoundError(self) + stream = self.root.open(self.at, zip_mode, pwd=pwd) + if 'b' in mode: + if args or kwargs: + raise ValueError("encoding args invalid for binary operation") + return stream + # Text mode: + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + return io.TextIOWrapper(stream, encoding, *args, **kwargs) + + @property + def name(self): + return pathlib.Path(self.at).name or self.filename.name + + @property + def suffix(self): + return pathlib.Path(self.at).suffix or self.filename.suffix + + @property + def suffixes(self): + return pathlib.Path(self.at).suffixes or self.filename.suffixes + + @property + def stem(self): + return pathlib.Path(self.at).stem or self.filename.stem + + @property + def filename(self): + return pathlib.Path(self.root.filename).joinpath(self.at) + + def read_text(self, *args, **kwargs): + encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) + with self.open('r', encoding, *args, **kwargs) as strm: + return strm.read() + + def read_bytes(self): + with self.open('rb') as strm: + return strm.read() + + def _is_child(self, path): + return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") + + def _next(self, at): + return self.__class__(self.root, at) + + def is_dir(self): + return not self.at or self.at.endswith("/") + + def is_file(self): + return self.exists() and not self.is_dir() + + def exists(self): + return self.at in self.root._name_set() + + def iterdir(self): + if not self.is_dir(): + raise ValueError("Can't listdir a file") + subs = map(self._next, self.root.namelist()) + return filter(self._is_child, subs) + + def __str__(self): + return posixpath.join(self.root.filename, self.at) + + def __repr__(self): + return self.__repr.format(self=self) + + def joinpath(self, *other): + next = posixpath.join(self.at, *other) + return self._next(self.root.resolve_dir(next)) + + __truediv__ = joinpath + + @property + def parent(self): + if not self.at: + return self.filename.parent + parent_at = posixpath.dirname(self.at.rstrip('/')) + if parent_at: + parent_at += '/' + return self._next(parent_at) diff --git a/Misc/NEWS.d/next/Library/2023-01-18-17-58-50.gh-issue-101144.FHd8Un.rst b/Misc/NEWS.d/next/Library/2023-01-18-17-58-50.gh-issue-101144.FHd8Un.rst new file mode 100644 index 00000000000000..297652259949fc --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-01-18-17-58-50.gh-issue-101144.FHd8Un.rst @@ -0,0 +1,4 @@ +Make :func:`zipfile.Path.open` and :func:`zipfile.Path.read_text` also accept +``encoding`` as a positional argument. This was the behavior in Python 3.9 and +earlier. 3.10 introduced a regression where supplying it as a positional +argument would lead to a :exc:`TypeError`. From eac75a7a9350295822e2503a84c5f000771bc737 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith [Google LLC]" Date: Thu, 19 Jan 2023 23:19:28 -0800 Subject: [PATCH 2/2] remove the added .py files, cherrypick patch mess. --- Lib/test/test_zipfile/test_path.py | 495 ----------------------------- Lib/zipfile/_path.py | 338 -------------------- 2 files changed, 833 deletions(-) delete mode 100644 Lib/test/test_zipfile/test_path.py delete mode 100644 Lib/zipfile/_path.py diff --git a/Lib/test/test_zipfile/test_path.py b/Lib/test/test_zipfile/test_path.py deleted file mode 100644 index 3086fd2080a97d..00000000000000 --- a/Lib/test/test_zipfile/test_path.py +++ /dev/null @@ -1,495 +0,0 @@ -import io -import itertools -import contextlib -import pathlib -import pickle -import string -from test.support.script_helper import assert_python_ok -import unittest -import zipfile - -from ._test_params import parameterize, Invoked -from ._functools import compose - - -from test.support.os_helper import temp_dir - - -# Poor man's technique to consume a (smallish) iterable. -consume = tuple - - -# from jaraco.itertools 5.0 -class jaraco: - class itertools: - class Counter: - def __init__(self, i): - self.count = 0 - self._orig_iter = iter(i) - - def __iter__(self): - return self - - def __next__(self): - result = next(self._orig_iter) - self.count += 1 - return result - - -def add_dirs(zf): - """ - Given a writable zip file zf, inject directory entries for - any directories implied by the presence of children. - """ - for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()): - zf.writestr(name, b"") - return zf - - -def build_alpharep_fixture(): - """ - Create a zip file with this structure: - - . - ├── a.txt - ├── b - │ ├── c.txt - │ ├── d - │ │ └── e.txt - │ └── f.txt - └── g - └── h - └── i.txt - - This fixture has the following key characteristics: - - - a file at the root (a) - - a file two levels deep (b/d/e) - - multiple files in a directory (b/c, b/f) - - a directory containing only a directory (g/h) - - "alpha" because it uses alphabet - "rep" because it's a representative example - """ - data = io.BytesIO() - zf = zipfile.ZipFile(data, "w") - zf.writestr("a.txt", b"content of a") - zf.writestr("b/c.txt", b"content of c") - zf.writestr("b/d/e.txt", b"content of e") - zf.writestr("b/f.txt", b"content of f") - zf.writestr("g/h/i.txt", b"content of i") - zf.filename = "alpharep.zip" - return zf - - -alpharep_generators = [ - Invoked.wrap(build_alpharep_fixture), - Invoked.wrap(compose(add_dirs, build_alpharep_fixture)), -] - -pass_alpharep = parameterize(['alpharep'], alpharep_generators) - - -class TestPath(unittest.TestCase): - def setUp(self): - self.fixtures = contextlib.ExitStack() - self.addCleanup(self.fixtures.close) - - def zipfile_ondisk(self, alpharep): - tmpdir = pathlib.Path(self.fixtures.enter_context(temp_dir())) - buffer = alpharep.fp - alpharep.close() - path = tmpdir / alpharep.filename - with path.open("wb") as strm: - strm.write(buffer.getvalue()) - return path - - @pass_alpharep - def test_iterdir_and_types(self, alpharep): - root = zipfile.Path(alpharep) - assert root.is_dir() - a, b, g = root.iterdir() - assert a.is_file() - assert b.is_dir() - assert g.is_dir() - c, f, d = b.iterdir() - assert c.is_file() and f.is_file() - (e,) = d.iterdir() - assert e.is_file() - (h,) = g.iterdir() - (i,) = h.iterdir() - assert i.is_file() - - @pass_alpharep - def test_is_file_missing(self, alpharep): - root = zipfile.Path(alpharep) - assert not root.joinpath('missing.txt').is_file() - - @pass_alpharep - def test_iterdir_on_file(self, alpharep): - root = zipfile.Path(alpharep) - a, b, g = root.iterdir() - with self.assertRaises(ValueError): - a.iterdir() - - @pass_alpharep - def test_subdir_is_dir(self, alpharep): - root = zipfile.Path(alpharep) - assert (root / 'b').is_dir() - assert (root / 'b/').is_dir() - assert (root / 'g').is_dir() - assert (root / 'g/').is_dir() - - @pass_alpharep - def test_open(self, alpharep): - root = zipfile.Path(alpharep) - a, b, g = root.iterdir() - with a.open(encoding="utf-8") as strm: - data = strm.read() - self.assertEqual(data, "content of a") - with a.open('r', "utf-8") as strm: # not a kw, no gh-101144 TypeError - data = strm.read() - self.assertEqual(data, "content of a") - - def test_open_encoding_utf16(self): - in_memory_file = io.BytesIO() - zf = zipfile.ZipFile(in_memory_file, "w") - zf.writestr("path/16.txt", "This was utf-16".encode("utf-16")) - zf.filename = "test_open_utf16.zip" - root = zipfile.Path(zf) - (path,) = root.iterdir() - u16 = path.joinpath("16.txt") - with u16.open('r', "utf-16") as strm: - data = strm.read() - self.assertEqual(data, "This was utf-16") - with u16.open(encoding="utf-16") as strm: - data = strm.read() - self.assertEqual(data, "This was utf-16") - - def test_open_encoding_errors(self): - in_memory_file = io.BytesIO() - zf = zipfile.ZipFile(in_memory_file, "w") - zf.writestr("path/bad-utf8.bin", b"invalid utf-8: \xff\xff.") - zf.filename = "test_read_text_encoding_errors.zip" - root = zipfile.Path(zf) - (path,) = root.iterdir() - u16 = path.joinpath("bad-utf8.bin") - - # encoding= as a positional argument for gh-101144. - data = u16.read_text("utf-8", errors="ignore") - self.assertEqual(data, "invalid utf-8: .") - with u16.open("r", "utf-8", errors="surrogateescape") as f: - self.assertEqual(f.read(), "invalid utf-8: \udcff\udcff.") - - # encoding= both positional and keyword is an error; gh-101144. - with self.assertRaisesRegex(TypeError, "encoding"): - data = u16.read_text("utf-8", encoding="utf-8") - - # both keyword arguments work. - with u16.open("r", encoding="utf-8", errors="strict") as f: - # error during decoding with wrong codec. - with self.assertRaises(UnicodeDecodeError): - f.read() - - def test_encoding_warnings(self): - """EncodingWarning must blame the read_text and open calls.""" - code = '''\ -import io, zipfile -with zipfile.ZipFile(io.BytesIO(), "w") as zf: - zf.filename = '' - zf.writestr("path/file.txt", b"Spanish Inquisition") - root = zipfile.Path(zf) - (path,) = root.iterdir() - file_path = path.joinpath("file.txt") - unused = file_path.read_text() # should warn - file_path.open("r").close() # should warn -''' - proc = assert_python_ok('-X', 'warn_default_encoding', '-c', code) - warnings = proc.err.splitlines() - self.assertEqual(len(warnings), 2, proc.err) - self.assertRegex(warnings[0], rb"^:8: EncodingWarning:") - self.assertRegex(warnings[1], rb"^:9: EncodingWarning:") - - def test_open_write(self): - """ - If the zipfile is open for write, it should be possible to - write bytes or text to it. - """ - zf = zipfile.Path(zipfile.ZipFile(io.BytesIO(), mode='w')) - with zf.joinpath('file.bin').open('wb') as strm: - strm.write(b'binary contents') - with zf.joinpath('file.txt').open('w', encoding="utf-8") as strm: - strm.write('text file') - - def test_open_extant_directory(self): - """ - Attempting to open a directory raises IsADirectoryError. - """ - zf = zipfile.Path(add_dirs(build_alpharep_fixture())) - with self.assertRaises(IsADirectoryError): - zf.joinpath('b').open() - - @pass_alpharep - def test_open_binary_invalid_args(self, alpharep): - root = zipfile.Path(alpharep) - with self.assertRaises(ValueError): - root.joinpath('a.txt').open('rb', encoding='utf-8') - with self.assertRaises(ValueError): - root.joinpath('a.txt').open('rb', 'utf-8') - - def test_open_missing_directory(self): - """ - Attempting to open a missing directory raises FileNotFoundError. - """ - zf = zipfile.Path(add_dirs(build_alpharep_fixture())) - with self.assertRaises(FileNotFoundError): - zf.joinpath('z').open() - - @pass_alpharep - def test_read(self, alpharep): - root = zipfile.Path(alpharep) - a, b, g = root.iterdir() - assert a.read_text(encoding="utf-8") == "content of a" - a.read_text("utf-8") # No positional arg TypeError per gh-101144. - assert a.read_bytes() == b"content of a" - - @pass_alpharep - def test_joinpath(self, alpharep): - root = zipfile.Path(alpharep) - a = root.joinpath("a.txt") - assert a.is_file() - e = root.joinpath("b").joinpath("d").joinpath("e.txt") - assert e.read_text(encoding="utf-8") == "content of e" - - @pass_alpharep - def test_joinpath_multiple(self, alpharep): - root = zipfile.Path(alpharep) - e = root.joinpath("b", "d", "e.txt") - assert e.read_text(encoding="utf-8") == "content of e" - - @pass_alpharep - def test_traverse_truediv(self, alpharep): - root = zipfile.Path(alpharep) - a = root / "a.txt" - assert a.is_file() - e = root / "b" / "d" / "e.txt" - assert e.read_text(encoding="utf-8") == "content of e" - - @pass_alpharep - def test_traverse_simplediv(self, alpharep): - """ - Disable the __future__.division when testing traversal. - """ - code = compile( - source="zipfile.Path(alpharep) / 'a'", - filename="(test)", - mode="eval", - dont_inherit=True, - ) - eval(code) - - @pass_alpharep - def test_pathlike_construction(self, alpharep): - """ - zipfile.Path should be constructable from a path-like object - """ - zipfile_ondisk = self.zipfile_ondisk(alpharep) - pathlike = pathlib.Path(str(zipfile_ondisk)) - zipfile.Path(pathlike) - - @pass_alpharep - def test_traverse_pathlike(self, alpharep): - root = zipfile.Path(alpharep) - root / pathlib.Path("a") - - @pass_alpharep - def test_parent(self, alpharep): - root = zipfile.Path(alpharep) - assert (root / 'a').parent.at == '' - assert (root / 'a' / 'b').parent.at == 'a/' - - @pass_alpharep - def test_dir_parent(self, alpharep): - root = zipfile.Path(alpharep) - assert (root / 'b').parent.at == '' - assert (root / 'b/').parent.at == '' - - @pass_alpharep - def test_missing_dir_parent(self, alpharep): - root = zipfile.Path(alpharep) - assert (root / 'missing dir/').parent.at == '' - - @pass_alpharep - def test_mutability(self, alpharep): - """ - If the underlying zipfile is changed, the Path object should - reflect that change. - """ - root = zipfile.Path(alpharep) - a, b, g = root.iterdir() - alpharep.writestr('foo.txt', 'foo') - alpharep.writestr('bar/baz.txt', 'baz') - assert any(child.name == 'foo.txt' for child in root.iterdir()) - assert (root / 'foo.txt').read_text(encoding="utf-8") == 'foo' - (baz,) = (root / 'bar').iterdir() - assert baz.read_text(encoding="utf-8") == 'baz' - - HUGE_ZIPFILE_NUM_ENTRIES = 2**13 - - def huge_zipfile(self): - """Create a read-only zipfile with a huge number of entries entries.""" - strm = io.BytesIO() - zf = zipfile.ZipFile(strm, "w") - for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): - zf.writestr(entry, entry) - zf.mode = 'r' - return zf - - def test_joinpath_constant_time(self): - """ - Ensure joinpath on items in zipfile is linear time. - """ - root = zipfile.Path(self.huge_zipfile()) - entries = jaraco.itertools.Counter(root.iterdir()) - for entry in entries: - entry.joinpath('suffix') - # Check the file iterated all items - assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES - - # @func_timeout.func_set_timeout(3) - def test_implied_dirs_performance(self): - data = ['/'.join(string.ascii_lowercase + str(n)) for n in range(10000)] - zipfile.CompleteDirs._implied_dirs(data) - - @pass_alpharep - def test_read_does_not_close(self, alpharep): - alpharep = self.zipfile_ondisk(alpharep) - with zipfile.ZipFile(alpharep) as file: - for rep in range(2): - zipfile.Path(file, 'a.txt').read_text(encoding="utf-8") - - @pass_alpharep - def test_subclass(self, alpharep): - class Subclass(zipfile.Path): - pass - - root = Subclass(alpharep) - assert isinstance(root / 'b', Subclass) - - @pass_alpharep - def test_filename(self, alpharep): - root = zipfile.Path(alpharep) - assert root.filename == pathlib.Path('alpharep.zip') - - @pass_alpharep - def test_root_name(self, alpharep): - """ - The name of the root should be the name of the zipfile - """ - root = zipfile.Path(alpharep) - assert root.name == 'alpharep.zip' == root.filename.name - - @pass_alpharep - def test_suffix(self, alpharep): - """ - The suffix of the root should be the suffix of the zipfile. - The suffix of each nested file is the final component's last suffix, if any. - Includes the leading period, just like pathlib.Path. - """ - root = zipfile.Path(alpharep) - assert root.suffix == '.zip' == root.filename.suffix - - b = root / "b.txt" - assert b.suffix == ".txt" - - c = root / "c" / "filename.tar.gz" - assert c.suffix == ".gz" - - d = root / "d" - assert d.suffix == "" - - @pass_alpharep - def test_suffixes(self, alpharep): - """ - The suffix of the root should be the suffix of the zipfile. - The suffix of each nested file is the final component's last suffix, if any. - Includes the leading period, just like pathlib.Path. - """ - root = zipfile.Path(alpharep) - assert root.suffixes == ['.zip'] == root.filename.suffixes - - b = root / 'b.txt' - assert b.suffixes == ['.txt'] - - c = root / 'c' / 'filename.tar.gz' - assert c.suffixes == ['.tar', '.gz'] - - d = root / 'd' - assert d.suffixes == [] - - e = root / '.hgrc' - assert e.suffixes == [] - - @pass_alpharep - def test_stem(self, alpharep): - """ - The final path component, without its suffix - """ - root = zipfile.Path(alpharep) - assert root.stem == 'alpharep' == root.filename.stem - - b = root / "b.txt" - assert b.stem == "b" - - c = root / "c" / "filename.tar.gz" - assert c.stem == "filename.tar" - - d = root / "d" - assert d.stem == "d" - - @pass_alpharep - def test_root_parent(self, alpharep): - root = zipfile.Path(alpharep) - assert root.parent == pathlib.Path('.') - root.root.filename = 'foo/bar.zip' - assert root.parent == pathlib.Path('foo') - - @pass_alpharep - def test_root_unnamed(self, alpharep): - """ - It is an error to attempt to get the name - or parent of an unnamed zipfile. - """ - alpharep.filename = None - root = zipfile.Path(alpharep) - with self.assertRaises(TypeError): - root.name - with self.assertRaises(TypeError): - root.parent - - # .name and .parent should still work on subs - sub = root / "b" - assert sub.name == "b" - assert sub.parent - - @pass_alpharep - def test_inheritance(self, alpharep): - cls = type('PathChild', (zipfile.Path,), {}) - file = cls(alpharep).joinpath('some dir').parent - assert isinstance(file, cls) - - @parameterize( - ['alpharep', 'path_type', 'subpath'], - itertools.product( - alpharep_generators, - [str, pathlib.Path], - ['', 'b/'], - ), - ) - def test_pickle(self, alpharep, path_type, subpath): - zipfile_ondisk = path_type(self.zipfile_ondisk(alpharep)) - - saved_1 = pickle.dumps(zipfile.Path(zipfile_ondisk, at=subpath)) - restored_1 = pickle.loads(saved_1) - first, *rest = restored_1.iterdir() - assert first.read_text().startswith('content of ') diff --git a/Lib/zipfile/_path.py b/Lib/zipfile/_path.py deleted file mode 100644 index 7c7a6a0e2c0d32..00000000000000 --- a/Lib/zipfile/_path.py +++ /dev/null @@ -1,338 +0,0 @@ -import io -import posixpath -import zipfile -import itertools -import contextlib -import pathlib - - -__all__ = ['Path'] - - -def _parents(path): - """ - Given a path with elements separated by - posixpath.sep, generate all parents of that path. - - >>> list(_parents('b/d')) - ['b'] - >>> list(_parents('/b/d/')) - ['/b'] - >>> list(_parents('b/d/f/')) - ['b/d', 'b'] - >>> list(_parents('b')) - [] - >>> list(_parents('')) - [] - """ - return itertools.islice(_ancestry(path), 1, None) - - -def _ancestry(path): - """ - Given a path with elements separated by - posixpath.sep, generate all elements of that path - - >>> list(_ancestry('b/d')) - ['b/d', 'b'] - >>> list(_ancestry('/b/d/')) - ['/b/d', '/b'] - >>> list(_ancestry('b/d/f/')) - ['b/d/f', 'b/d', 'b'] - >>> list(_ancestry('b')) - ['b'] - >>> list(_ancestry('')) - [] - """ - path = path.rstrip(posixpath.sep) - while path and path != posixpath.sep: - yield path - path, tail = posixpath.split(path) - - -_dedupe = dict.fromkeys -"""Deduplicate an iterable in original order""" - - -def _difference(minuend, subtrahend): - """ - Return items in minuend not in subtrahend, retaining order - with O(1) lookup. - """ - return itertools.filterfalse(set(subtrahend).__contains__, minuend) - - -class InitializedState: - """ - Mix-in to save the initialization state for pickling. - """ - - def __init__(self, *args, **kwargs): - self.__args = args - self.__kwargs = kwargs - super().__init__(*args, **kwargs) - - def __getstate__(self): - return self.__args, self.__kwargs - - def __setstate__(self, state): - args, kwargs = state - super().__init__(*args, **kwargs) - - -class CompleteDirs(InitializedState, zipfile.ZipFile): - """ - A ZipFile subclass that ensures that implied directories - are always included in the namelist. - """ - - @staticmethod - def _implied_dirs(names): - parents = itertools.chain.from_iterable(map(_parents, names)) - as_dirs = (p + posixpath.sep for p in parents) - return _dedupe(_difference(as_dirs, names)) - - def namelist(self): - names = super(CompleteDirs, self).namelist() - return names + list(self._implied_dirs(names)) - - def _name_set(self): - return set(self.namelist()) - - def resolve_dir(self, name): - """ - If the name represents a directory, return that name - as a directory (with the trailing slash). - """ - names = self._name_set() - dirname = name + '/' - dir_match = name not in names and dirname in names - return dirname if dir_match else name - - @classmethod - def make(cls, source): - """ - Given a source (filename or zipfile), return an - appropriate CompleteDirs subclass. - """ - if isinstance(source, CompleteDirs): - return source - - if not isinstance(source, zipfile.ZipFile): - return cls(source) - - # Only allow for FastLookup when supplied zipfile is read-only - if 'r' not in source.mode: - cls = CompleteDirs - - source.__class__ = cls - return source - - -class FastLookup(CompleteDirs): - """ - ZipFile subclass to ensure implicit - dirs exist and are resolved rapidly. - """ - - def namelist(self): - with contextlib.suppress(AttributeError): - return self.__names - self.__names = super(FastLookup, self).namelist() - return self.__names - - def _name_set(self): - with contextlib.suppress(AttributeError): - return self.__lookup - self.__lookup = super(FastLookup, self)._name_set() - return self.__lookup - - -def _extract_text_encoding(encoding=None, *args, **kwargs): - # stacklevel=3 so that the caller of the caller see any warning. - return io.text_encoding(encoding, 3), args, kwargs - - -class Path: - """ - A pathlib-compatible interface for zip files. - - Consider a zip file with this structure:: - - . - ├── a.txt - └── b - ├── c.txt - └── d - └── e.txt - - >>> data = io.BytesIO() - >>> zf = ZipFile(data, 'w') - >>> zf.writestr('a.txt', 'content of a') - >>> zf.writestr('b/c.txt', 'content of c') - >>> zf.writestr('b/d/e.txt', 'content of e') - >>> zf.filename = 'mem/abcde.zip' - - Path accepts the zipfile object itself or a filename - - >>> root = Path(zf) - - From there, several path operations are available. - - Directory iteration (including the zip file itself): - - >>> a, b = root.iterdir() - >>> a - Path('mem/abcde.zip', 'a.txt') - >>> b - Path('mem/abcde.zip', 'b/') - - name property: - - >>> b.name - 'b' - - join with divide operator: - - >>> c = b / 'c.txt' - >>> c - Path('mem/abcde.zip', 'b/c.txt') - >>> c.name - 'c.txt' - - Read text: - - >>> c.read_text() - 'content of c' - - existence: - - >>> c.exists() - True - >>> (b / 'missing.txt').exists() - False - - Coercion to string: - - >>> import os - >>> str(c).replace(os.sep, posixpath.sep) - 'mem/abcde.zip/b/c.txt' - - At the root, ``name``, ``filename``, and ``parent`` - resolve to the zipfile. Note these attributes are not - valid and will raise a ``ValueError`` if the zipfile - has no filename. - - >>> root.name - 'abcde.zip' - >>> str(root.filename).replace(os.sep, posixpath.sep) - 'mem/abcde.zip' - >>> str(root.parent) - 'mem' - """ - - __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" - - def __init__(self, root, at=""): - """ - Construct a Path from a ZipFile or filename. - - Note: When the source is an existing ZipFile object, - its type (__class__) will be mutated to a - specialized type. If the caller wishes to retain the - original type, the caller should either create a - separate ZipFile object or pass a filename. - """ - self.root = FastLookup.make(root) - self.at = at - - def open(self, mode='r', *args, pwd=None, **kwargs): - """ - Open this entry as text or binary following the semantics - of ``pathlib.Path.open()`` by passing arguments through - to io.TextIOWrapper(). - """ - if self.is_dir(): - raise IsADirectoryError(self) - zip_mode = mode[0] - if not self.exists() and zip_mode == 'r': - raise FileNotFoundError(self) - stream = self.root.open(self.at, zip_mode, pwd=pwd) - if 'b' in mode: - if args or kwargs: - raise ValueError("encoding args invalid for binary operation") - return stream - # Text mode: - encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) - return io.TextIOWrapper(stream, encoding, *args, **kwargs) - - @property - def name(self): - return pathlib.Path(self.at).name or self.filename.name - - @property - def suffix(self): - return pathlib.Path(self.at).suffix or self.filename.suffix - - @property - def suffixes(self): - return pathlib.Path(self.at).suffixes or self.filename.suffixes - - @property - def stem(self): - return pathlib.Path(self.at).stem or self.filename.stem - - @property - def filename(self): - return pathlib.Path(self.root.filename).joinpath(self.at) - - def read_text(self, *args, **kwargs): - encoding, args, kwargs = _extract_text_encoding(*args, **kwargs) - with self.open('r', encoding, *args, **kwargs) as strm: - return strm.read() - - def read_bytes(self): - with self.open('rb') as strm: - return strm.read() - - def _is_child(self, path): - return posixpath.dirname(path.at.rstrip("/")) == self.at.rstrip("/") - - def _next(self, at): - return self.__class__(self.root, at) - - def is_dir(self): - return not self.at or self.at.endswith("/") - - def is_file(self): - return self.exists() and not self.is_dir() - - def exists(self): - return self.at in self.root._name_set() - - def iterdir(self): - if not self.is_dir(): - raise ValueError("Can't listdir a file") - subs = map(self._next, self.root.namelist()) - return filter(self._is_child, subs) - - def __str__(self): - return posixpath.join(self.root.filename, self.at) - - def __repr__(self): - return self.__repr.format(self=self) - - def joinpath(self, *other): - next = posixpath.join(self.at, *other) - return self._next(self.root.resolve_dir(next)) - - __truediv__ = joinpath - - @property - def parent(self): - if not self.at: - return self.filename.parent - parent_at = posixpath.dirname(self.at.rstrip('/')) - if parent_at: - parent_at += '/' - return self._next(parent_at)