Add auto_device_count and device name support (#13423)

jerome-habana · kaushikb11 · rohitgr7 · web-flow · commit 9596fabe7baa · 2022-07-22T10:29:02.000+05:30
Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com>
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com>
Co-authored-by: Jirka <jirka.borovec@seznam.cz>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
Co-authored-by: awaelchli <aedu.waelchli@gmail.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
Co-authored-by: ananthsub <ananth.subramaniam@gmail.com>
Co-authored-by: mansy <mansy@lightning.ai>
Co-authored-by: manskx <mansy@lightning.ai>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Mansy <ahmed.mansy156@gmail.com>
Co-authored-by: otaj <ota@lightning.ai>
Co-authored-by: Sean Naren <sean@grid.ai>
Co-authored-by: Keiichi Kuroyanagi <kuroyanagi.keiichi@gmail.com>
Co-authored-by: Martino Sorbaro <martinosorb@users.noreply.github.com>
Co-authored-by: Wang Ran (汪然) <wangr@smail.nju.edu.cn>
Co-authored-by: Rhys Goodall <rhys.goodall@outlook.com>
Co-authored-by: Siyuan Li <siyuanli.s.c@gmail.com>
Co-authored-by: Ekagra Ranjan <ekagra.ranjan@gmail.com>
Co-authored-by: S. Kumano <54502860+s-kumano@users.noreply.github.com>
Co-authored-by: otaj <6065855+otaj@users.noreply.github.com>
Co-authored-by: Gautier Dagan <gautierdagan2017@u.northwestern.edu>
Co-authored-by: Sherin Thomas <sherinct@live.com>
Co-authored-by: Cyprien Ricque <48893621+Cyprien-Ricque@users.noreply.github.com>
Co-authored-by: Masahiro Wada <argon.argon.argon@gmail.com>
Co-authored-by: nitinramvelraj <98356761+nitinramvelraj@users.noreply.github.com>
Co-authored-by: donlapark <10988155+donlapark@users.noreply.github.com>
Co-authored-by: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com>
Co-authored-by: Shantam Gilra <64306405+shantam-8@users.noreply.github.com>
Co-authored-by: Bibhabasu Mohapatra <68384968+bibhabasumohapatra@users.noreply.github.com>
Co-authored-by: Jimmy Yao <jiahaoyao.math@gmail.com>
Co-authored-by: Nikhil Shenoy <nikhilshenoy98@gmail.com>
Co-authored-by: Sanjay Aradhyamath <57592361+samz5320@users.noreply.github.com>
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
@@ -147,6 +147,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Updated `val_check_interval`(int) to consider total train batches processed instead of `_batches_that_stepped` for validation check during training ([#12832](https://github.com/Lightning-AI/lightning/pull/12832)
 
 
+- Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/PyTorchLightning/pytorch-lightning/pull/13423))
+
+
+-
+
+
 ### Deprecated
 
 - Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014))
diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py
@@ -21,6 +21,9 @@
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.rank_zero import rank_zero_debug
 
+if _HPU_AVAILABLE:
+    import habana_frameworks.torch.hpu as torch_hpu
+
 
 class HPUAccelerator(Accelerator):
     """Accelerator for HPU devices."""
@@ -52,13 +55,28 @@ def get_parallel_devices(devices: int) -> List[torch.device]:
 
     @staticmethod
     def auto_device_count() -> int:
-        """Get the devices when set to auto."""
-        # TODO(@kaushikb11): Update this when api is exposed by the Habana team
-        return 8
+        """Returns the number of HPU devices when the devices is set to auto."""
+        try:
+            return torch_hpu.device_count()
+        except (AttributeError, NameError):
+            rank_zero_debug("HPU `auto_device_count` failed, returning default count of 8.")
+            return 8
 
     @staticmethod
     def is_available() -> bool:
-        return _HPU_AVAILABLE
+        """Returns a bool indicating if HPU is currently available."""
+        try:
+            return torch_hpu.is_available()
+        except (AttributeError, NameError):
+            return False
+
+    @staticmethod
+    def get_device_name() -> str:
+        """Returns the name of the HPU device."""
+        try:
+            return torch_hpu.get_device_name()
+        except (AttributeError, NameError):
+            return ""
 
     @classmethod
     def register_accelerators(cls, accelerator_registry: Dict) -> None:
diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py
@@ -32,7 +32,7 @@
 
 if _HPU_AVAILABLE:
     import habana_frameworks.torch.core as htcore
-    import habana_frameworks.torch.core.hccl  # noqa: F401
+    import habana_frameworks.torch.distributed.hccl  # noqa: F401
 
 log = logging.getLogger(__name__)
 
diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py
@@ -24,7 +24,6 @@
 
 if _HPU_AVAILABLE:
     import habana_frameworks.torch.core as htcore
-    import habana_frameworks.torch.core.hccl  # noqa: F401
 
 
 class SingleHPUStrategy(SingleDeviceStrategy):
diff --git a/tests/tests_pytorch/accelerators/test_hpu.py b/tests/tests_pytorch/accelerators/test_hpu.py
@@ -40,6 +40,11 @@ def test_availability():
     assert HPUAccelerator.is_available()
 
 
+@RunIf(hpu=True)
+def test_device_name():
+    assert HPUAccelerator.get_device_name() == "GAUDI"
+
+
 @pytest.mark.skipif(_HPU_AVAILABLE, reason="test requires non-HPU machine")
 def test_fail_if_no_hpus():
     with pytest.raises(MisconfigurationException, match="HPUAccelerator can not run on your system"):
@@ -239,6 +244,7 @@ def test_inference_only(tmpdir, hpus):
     trainer.predict(model)
 
 
+@RunIf(hpu=True)
 def test_hpu_auto_device_count():
     assert HPUAccelerator.auto_device_count() == 8