MANIFEST.in and setup.py clean-up (Lightning-AI#7614)

carmocca · web-flow · commit 3d2d0f25362f · 2021-11-19T15:38:42.000+01:00
diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml
@@ -26,12 +26,11 @@ jobs:
 
       - name: Prepare env
         run: |
-          pip install check-manifest "twine==3.2" setuptools wheel
+          pip install "twine==3.2" setuptools wheel
 
       - name: Create package
         run: |
-          check-manifest
-          # python setup.py check --metadata --strict
+          python setup.py check --metadata --strict
           python setup.py sdist bdist_wheel
 
       - name: Check package
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -11,69 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# Manifest syntax https://docs.python.org/2/distutils/sourcedist.html
-graft wheelhouse
-
-recursive-exclude __pycache__  *.py[cod] *.orig
-
-# Include the README and CHANGELOG
-include *.md
-
-# Include the license file
-include LICENSE
-
-# Include the citation info
-include *.cff
-
-exclude *.sh
-exclude *.svg
-recursive-include pytorch_lightning *.py
-
-# Include marker file for PEP 561
-include pytorch_lightning/py.typed
-
-# include examples
-recursive-include pl_examples *.py *.md *.sh *.txt *.toml
-
-# exclude tests from package
-recursive-exclude tests *
-recursive-exclude site *
-exclude tests
-
-# Exclude the documentation files
-recursive-exclude docs *
-exclude docs
-recursive-include docs/source/_static/images/logos/ *
-recursive-include docs/source/_static/images/general/ pl_overview* tf_* tutorial_* PTL101_*
-
-# Include the Requirements
+include pytorch_lightning/py.typed  # marker file for PEP 561
+include CHANGELOG.md
 recursive-include requirements *.txt
-recursive-exclude requirements *.sh *.py
 include requirements.txt
-include pyproject.toml
-
-# Exclude build configs
-exclude *.yml
-exclude *.yaml
-exclude *.toml
-exclude *.jsonnet
-
-# Exclude pyright config
-exclude .pyrightconfig.json
-
-# Exclude submodules
-exclude .gitmodules
-exclude _notebooks
-
-# Exclude Makefile
-exclude Makefile
-
-prune .git
-prune .github
-prune .circleci
-prune temp*
-prune test*
-prune benchmark*
-prune dockers
-prune legacy
+include *.cff  # citation info
diff --git a/pl_examples/basic_examples/mnist_datamodule.py b/pl_examples/basic_examples/mnist_datamodule.py
@@ -11,13 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 import os
 import platform
-from typing import Optional
+import random
+import time
+import urllib
+from typing import Optional, Tuple
 from urllib.error import HTTPError
 from warnings import warn
 
-from torch.utils.data import DataLoader, random_split
+import torch
+from torch.utils.data import DataLoader, Dataset, random_split
 
 from pl_examples import _DATASETS_PATH
 from pytorch_lightning import LightningDataModule
@@ -27,6 +32,97 @@
     from torchvision import transforms as transform_lib
 
 
+class _MNIST(Dataset):
+    """Adapted copy of ``tests.helpers.datasets.MNIST``.
+
+    We cannot import the tests as they are not distributed with the package.
+    See https://github.com/PyTorchLightning/pytorch-lightning/pull/7614#discussion_r671183652 for more context.
+    """
+
+    RESOURCES = (
+        "https://pl-public-data.s3.amazonaws.com/MNIST/processed/training.pt",
+        "https://pl-public-data.s3.amazonaws.com/MNIST/processed/test.pt",
+    )
+
+    TRAIN_FILE_NAME = "training.pt"
+    TEST_FILE_NAME = "test.pt"
+    cache_folder_name = "complete"
+
+    def __init__(
+        self, root: str, train: bool = True, normalize: tuple = (0.1307, 0.3081), download: bool = True, **kwargs
+    ):
+        """Make sure the files exist under ``root`` (optionally downloading them) and load the chosen split."""
+        super().__init__()
+        self.root = root
+        self.train = train  # training set or test set
+        self.normalize = normalize  # ``(mean, std)`` per sample; skipped if ``None`` or not of length 2
+        self.prepare_data(download)
+        data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME
+        self.data, self.targets = self._try_load(os.path.join(self.cached_folder_path, data_file))
+
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
+        """Return the ``idx``-th image as a float tensor with a leading dim added, and its label as ``int``."""
+        img = self.data[idx].float().unsqueeze(0)
+        target = int(self.targets[idx])
+        if self.normalize is not None and len(self.normalize) == 2:
+            img = self.normalize_tensor(img, *self.normalize)
+        return img, target
+
+    def __len__(self) -> int:
+        return len(self.data)
+
+    @property
+    def cached_folder_path(self) -> str:
+        """Folder ``<root>/MNIST/<cache_folder_name>`` that holds the dataset files."""
+        return os.path.join(self.root, "MNIST", self.cache_folder_name)
+
+    def _check_exists(self, data_folder: str) -> bool:
+        """Return ``True`` only if both the train and the test file are present in ``data_folder``."""
+        names = (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME)
+        return all(os.path.isfile(os.path.join(data_folder, fname)) for fname in names)
+
+    def prepare_data(self, download: bool = True):
+        """Download the files when ``download`` is set and they are missing; raise if they are still missing."""
+        if download and not self._check_exists(self.cached_folder_path):
+            self._download(self.cached_folder_path)
+        if not self._check_exists(self.cached_folder_path):
+            raise RuntimeError("Dataset not found.")
+
+    def _download(self, data_folder: str) -> None:
+        """Fetch every URL in ``RESOURCES`` into ``data_folder``, creating the folder if needed."""
+        import urllib.request  # ``import urllib`` alone does not load the ``request`` submodule
+
+        os.makedirs(data_folder, exist_ok=True)
+        for url in self.RESOURCES:
+            logging.info(f"Downloading {url}")
+            fpath = os.path.join(data_folder, os.path.basename(url))
+            urllib.request.urlretrieve(url, fpath)
+
+    @staticmethod
+    def _try_load(path_data, trials: int = 30, delta: float = 1.0):
+        """Load ``path_data``, retrying up to ``trials`` times in case concurrent processes access the file."""
+        exception = None
+        assert trials, "at least some trial has to be set"
+        assert os.path.isfile(path_data), f"missing file: {path_data}"
+        for _ in range(trials):
+            try:
+                return torch.load(path_data)  # success: failures of earlier trials no longer matter
+            # todo: specify the possible exception
+            except Exception as e:
+                exception = e
+                time.sleep(delta * random.random())  # random pause shorter than ``delta`` seconds
+        if exception is not None:
+            raise exception  # every trial failed: raise the last caught exception
+        return None  # only reachable when the loop body never ran
+
+    @staticmethod
+    def normalize_tensor(tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0) -> torch.Tensor:
+        """Return ``(tensor - mean) / std`` with ``mean``/``std`` cast to the dtype and device of ``tensor``."""
+        mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device)
+        std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device)
+        return tensor.sub(mean).div(std)
+
+
 def MNIST(*args, **kwargs):
     torchvision_mnist_available = not bool(os.getenv("PL_USE_MOCKED_MNIST", False))
     if torchvision_mnist_available:
@@ -39,7 +135,7 @@ def MNIST(*args, **kwargs):
             torchvision_mnist_available = False
     if not torchvision_mnist_available:
         print("`torchvision.datasets.MNIST` not available. Using our hosted version")
-        from tests.helpers.datasets import MNIST
+        MNIST = _MNIST
     return MNIST(*args, **kwargs)
 
 
diff --git a/pl_examples/run_examples.sh b/pl_examples/run_examples.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 set -ex
 
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
 dir_path=$(dirname "${BASH_SOURCE[0]}")
 args="
   --data.batch_size=32
diff --git a/pl_examples/test_examples.py b/pl_examples/test_examples.py
@@ -14,9 +14,10 @@
 from unittest import mock
 
 import pytest
+import torch
 
 from pl_examples import _DALI_AVAILABLE
-from tests.helpers.runif import RunIf
+from pytorch_lightning.utilities.imports import _IS_WINDOWS
 
 ARGS_DEFAULT = (
     "--trainer.default_root_dir %(tmpdir)s "
@@ -31,7 +32,8 @@
 
 
 @pytest.mark.skipif(not _DALI_AVAILABLE, reason="Nvidia DALI required")
-@RunIf(min_gpus=1, skip_windows=True)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+@pytest.mark.skipif(_IS_WINDOWS, reason="Not supported on Windows")
 @pytest.mark.parametrize("cli_args", [ARGS_GPU])
 def test_examples_mnist_dali(tmpdir, cli_args):
     from pl_examples.integration_examples.dali_image_classifier import cli_main
diff --git a/requirements/test.txt b/requirements/test.txt
@@ -2,7 +2,6 @@ coverage>5.2.0
 codecov>=2.1
 pytest>=6.0
 pytest-rerunfailures>=10.2
-check-manifest
 twine==3.2
 mypy>=0.900
 flake8>=3.9.2
diff --git a/setup.cfg b/setup.cfg
@@ -73,14 +73,6 @@ ignore =
     W503  # Ignore "Line break occurred before a binary operator"
     E203  # Ignore "whitespace before ':'"
 
-# setup.cfg or tox.ini
-[check-manifest]
-ignore =
-    *.yml
-    .github
-    .github/*
-    .circleci
-
 
 [metadata]
 license_file = LICENSE
diff --git a/setup.py b/setup.py
@@ -74,10 +74,10 @@ def _load_py_module(fname, pkg="pytorch_lightning"):
     url=about.__homepage__,
     download_url="https://github.com/PyTorchLightning/pytorch-lightning",
     license=about.__license__,
-    packages=find_packages(exclude=["tests", "tests/*", "benchmarks", "legacy", "legacy/*"]),
+    packages=find_packages(exclude=["tests*", "pl_examples*", "legacy*"]),
+    include_package_data=True,
     long_description=long_description,
     long_description_content_type="text/markdown",
-    include_package_data=True,
     zip_safe=False,
     keywords=["deep learning", "pytorch", "AI"],
     python_requires=">=3.6",
diff --git a/tests/helpers/datasets.py b/tests/helpers/datasets.py
@@ -19,7 +19,6 @@
 from typing import Optional, Sequence, Tuple
 
 import torch
-from torch import Tensor
 from torch.utils.data import Dataset
 
 
@@ -70,7 +69,7 @@ def __init__(
         data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME
         self.data, self.targets = self._try_load(os.path.join(self.cached_folder_path, data_file))
 
-    def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
+    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
         img = self.data[idx].float().unsqueeze(0)
         target = int(self.targets[idx])
 
@@ -126,7 +125,7 @@ def _try_load(path_data, trials: int = 30, delta: float = 1.0):
         return res
 
     @staticmethod
-    def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor:
+    def normalize_tensor(tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0) -> torch.Tensor:
         mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device)
         std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device)
         return tensor.sub(mean).div(std)
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
@@ -81,6 +81,7 @@ fi
 # report+="Ran\ttests/plugins/environments/torch_elastic_deadlock.py\n"
 
 # test that a user can manually launch individual processes
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
 args="--trainer.gpus 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py ${args} &
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python pl_examples/basic_examples/mnist_examples/image_classifier_5_lightning_datamodule.py ${args}