hpcaitech
diff --git a/‎README.md
Lines changed: 23 additions & 1 deletion b/‎README.md
Lines changed: 23 additions & 1 deletion
diff --git a/‎fastfold/data/feature_pipeline.py
Lines changed: 14 additions & 14 deletions b/‎fastfold/data/feature_pipeline.py
Lines changed: 14 additions & 14 deletions
diff --git a/‎fastfold/habana/__init__.py
Lines changed: 21 additions & 0 deletions b/‎fastfold/habana/__init__.py
Lines changed: 21 additions & 0 deletions
diff --git a/‎fastfold/habana/distributed/__init__.py
Lines changed: 8 additions & 0 deletions b/‎fastfold/habana/distributed/__init__.py
Lines changed: 8 additions & 0 deletions
diff --git a/‎fastfold/habana/distributed/comm.py
Lines changed: 188 additions & 0 deletions b/‎fastfold/habana/distributed/comm.py
Lines changed: 188 additions & 0 deletions
@@ -4,10 +4,17 @@
 
 [![](https://img.shields.io/badge/Paper-PDF-green?style=flat&logo=arXiv&logoColor=green)](https://arxiv.org/abs/2203.00854)
 ![](https://img.shields.io/badge/Made%20with-ColossalAI-blueviolet?style=flat)
+![](https://img.shields.io/badge/Habana-support-blue?style=flat&logo=intel&logoColor=blue)
 ![](https://img.shields.io/github/v/release/hpcaitech/FastFold)
 [![GitHub license](https://img.shields.io/github/license/hpcaitech/FastFold)](https://github.com/hpcaitech/FastFold/blob/main/LICENSE)
 
-Optimizing Protein Structure Prediction Model Training and Inference on GPU Clusters
+## News :triangular_flag_on_post:
+- [2023/01] Compatible with AlphaFold v2.3
+- [2023/01] Added support for inference and training of AlphaFold on the [Intel Habana](https://habana.ai/) platform. For usage instructions, see [here](#inference-or-training-on-intel-habana).
+
+<br>
+
+Optimizing Protein Structure Prediction Model Training and Inference on Heterogeneous Clusters
 
 FastFold provides a **high-performance implementation of Evoformer** with the following characteristics.
 
@@ -201,6 +208,17 @@ python inference.py target.fasta data/pdb_mmcif/mmcif_files/ \
     --kalign_binary_path `which kalign`
 ```
 
+### Inference or Training on Intel Habana
+
+To run AlphaFold inference or training on Intel Habana, follow the [Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/) to set up your environment on Amazon EC2 DL1 instances or in an on-premises environment.
+
+Once you have prepared your dataset and installed FastFold, you can use the following scripts:
+
+```shell
+bash habana/inference.sh
+bash habana/train.sh
+```
+
 ## Performance Benchmark
 
 We have included a performance benchmark script in `./benchmark`. You can benchmark the performance of Evoformer using different settings.
@@ -237,3 +255,7 @@ Cite this paper, if you use FastFold in your research publication.
       primaryClass={cs.LG}
 }
 ```
+
+## Acknowledgments
+
+We would like to extend our special thanks to the Intel Habana team for their support in providing us with technology and resources on the Habana platform.
@@ -20,6 +20,7 @@
 import numpy as np
 import torch
 
+import fastfold.habana as habana
 from fastfold.data import input_pipeline, input_pipeline_multimer
 
 
@@ -91,19 +92,18 @@ def np_example_to_features(
         np_example=np_example, features=feature_names
     )
 
-    with torch.no_grad():
-        if is_multimer:
-            features = input_pipeline_multimer.process_tensors_from_config(
-                tensor_dict,
-                cfg.common,
-                cfg[mode],
-            )
-        else:
-            features = input_pipeline.process_tensors_from_config(
-                tensor_dict,
-                cfg.common,
-                cfg[mode],
-            )
+    if is_multimer:
+        input_pipeline_fn = input_pipeline_multimer.process_tensors_from_config
+    else:
+        input_pipeline_fn = input_pipeline.process_tensors_from_config
+
+    if habana.is_habana():
+        from habana_frameworks.torch.hpex import hmp
+        with torch.no_grad(), hmp.disable_casts():
+            features = input_pipeline_fn(tensor_dict, cfg.common, cfg[mode])
+    else:
+        with torch.no_grad():
+            features = input_pipeline_fn(tensor_dict, cfg.common, cfg[mode])
 
     return {k: v for k, v in features.items()}
 
@@ -118,7 +118,7 @@ def __init__(
     def process_features(
         self,
         raw_features: FeatureDict,
-        mode: str = "train", 
+        mode: str = "train",
         is_multimer: bool = False,
     ) -> FeatureDict:
         return np_example_to_features(
 
@@ -0,0 +1,21 @@
+# Process-wide feature flags for the Habana backend. They are flipped by the
+# enable_* helpers below and read through the matching is_* helpers.
+ENABLE_HABANA = False
+ENABLE_HMP = False
+# Declared at module scope so the attribute exists before enable_habana() runs.
+ENABLE_LAZY_MODE = False
+
+
+def enable_habana():
+    """Switch the package into Habana mode.
+
+    Imports ``habana_frameworks.torch.core`` (the imported name is unused, so
+    the import is done for its side effects) and then sets ``ENABLE_HABANA``
+    and ``ENABLE_LAZY_MODE`` to ``True``.
+
+    Raises:
+        ImportError: if ``habana_frameworks`` cannot be imported. The flags
+            are left untouched in that case.
+    """
+    global ENABLE_HABANA, ENABLE_LAZY_MODE
+    # Import before flipping the flags so that a failed import cannot leave
+    # is_habana() reporting True on a machine without the Habana stack.
+    import habana_frameworks.torch.core  # noqa: F401
+
+    ENABLE_HABANA = True
+    ENABLE_LAZY_MODE = True
+
+
+def is_habana():
+    """Return ``True`` once ``enable_habana()`` has completed successfully."""
+    return ENABLE_HABANA
+
+
+def enable_hmp():
+    """Set the ``ENABLE_HMP`` flag to ``True``."""
+    global ENABLE_HMP
+    ENABLE_HMP = True
+
+
+def is_hmp():
+    """Return the current value of the ``ENABLE_HMP`` flag."""
+    return ENABLE_HMP
@@ -0,0 +1,8 @@
+from .comm import (All_to_All, _gather, _reduce, _split, col_to_row, copy,
+                   gather, reduce, row_to_col, scatter)
+from .core import init_dist
+
+__all__ = [
+    'init_dist', '_reduce', '_split', '_gather', 'copy', 'scatter', 'reduce', 'gather',
+    'col_to_row', 'row_to_col', 'All_to_All'
+]
@@ -0,0 +1,188 @@
+from typing import Tuple
+
+import torch
+import torch.distributed as dist
+from torch import Tensor
+
+from .core import (ensure_divisibility, get_tensor_model_parallel_group,
+                   get_tensor_model_parallel_rank,
+                   get_tensor_model_parallel_world_size)
+
+
+def divide(numerator, denominator):
+    """Return ``numerator // denominator`` after a divisibility check.
+
+    ``ensure_divisibility`` (imported from ``.core``) is called first with
+    both operands; it is presumably what rejects operands that do not divide
+    evenly -- confirm against ``core``.
+    """
+    ensure_divisibility(numerator, denominator)
+    quotient = numerator // denominator
+    return quotient
+
+
+def _reduce(tensor: Tensor) -> Tensor:
+    """Sum ``tensor`` across the tensor-model-parallel group and return it.
+
+    The all-reduce is synchronous and writes into ``tensor`` itself; the same
+    tensor object is returned. When the default process group has a single
+    rank the call is skipped.
+    """
+    # NOTE(review): this checks the *global* world size, whereas _split and
+    # _gather check the tensor-parallel world size -- confirm this is intended.
+    if dist.get_world_size() != 1:
+        dist.all_reduce(
+            tensor,
+            op=dist.ReduceOp.SUM,
+            group=get_tensor_model_parallel_group(),
+            async_op=False,
+        )
+    return tensor
+
+
+def _split(tensor: Tensor, dim: int = -1) -> Tensor:
+    """Return this rank's contiguous slice of ``tensor`` along ``dim``.
+
+    ``tensor`` is cut into equally sized pieces, one per tensor-parallel rank,
+    and the piece indexed by the local tensor-parallel rank is returned. With
+    a tensor-parallel world size of 1 the input is returned unchanged.
+    """
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size == 1:
+        return tensor
+
+    piece_len = divide(tensor.shape[dim], world_size)
+    pieces = torch.split(tensor, piece_len, dim=dim)
+    return pieces[get_tensor_model_parallel_rank()].contiguous()
+
+
+def _gather(tensor: Tensor, dim: int = -1) -> Tensor:
+    """All-gather ``tensor`` from every tensor-parallel rank along ``dim``.
+
+    With a tensor-parallel world size of 1 the input is returned unchanged.
+    Otherwise the result has the input's shape with ``dim`` multiplied by the
+    world size.
+    """
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size == 1:
+        return tensor
+
+    if dim == 1 and tensor.shape[0] == 1:
+        # Special case: allocate the final buffer up front and gather
+        # straight into its chunks (chunk() returns views of ``output``),
+        # which avoids the extra torch.cat of the general path.
+        gathered_shape = list(tensor.shape)
+        gathered_shape[1] *= world_size
+        output = torch.empty(gathered_shape, dtype=tensor.dtype, device=tensor.device)
+        slots = output.chunk(world_size, dim=1)
+        dist.all_gather(list(slots),
+                        tensor,
+                        group=get_tensor_model_parallel_group(),
+                        async_op=False)
+        return output
+
+    # General case: gather into per-rank buffers, then concatenate.
+    buffers = [torch.empty_like(tensor) for _ in range(world_size)]
+    dist.all_gather(buffers,
+                    tensor,
+                    group=get_tensor_model_parallel_group(),
+                    async_op=False)
+    return torch.cat(buffers, dim=dim)
+
+
+def copy(input: Tensor) -> Tensor:
+    """Pass ``input`` through, attaching the ``Copy`` autograd function if needed.
+
+    ``Copy.apply`` is used only when gradients are enabled and ``input``
+    requires grad; otherwise the very same tensor object is returned.
+    """
+    tracks_grad = torch.is_grad_enabled() and input.requires_grad
+    return Copy.apply(input) if tracks_grad else input
+
+
+class Copy(torch.autograd.Function):
+    """Autograd function: identity in forward, ``_reduce`` in backward."""
+
+    @staticmethod
+    def forward(ctx: "Copy", input: Tensor) -> Tensor:
+        """Return ``input`` unchanged."""
+        return input
+
+    @staticmethod
+    def backward(ctx: "Copy", grad_output: Tensor) -> Tensor:
+        """Return ``grad_output`` after passing it through ``_reduce``."""
+        reduced_grad = _reduce(grad_output)
+        return reduced_grad
+
+
+def scatter(input: Tensor, dim: int = -1) -> Tensor:
+    """Keep only this rank's slice of ``input`` along ``dim``.
+
+    Goes through ``Scatter.apply`` when gradients are enabled and ``input``
+    requires grad; otherwise calls ``_split`` directly.
+    """
+    if not (torch.is_grad_enabled() and input.requires_grad):
+        return _split(input, dim=dim)
+    return Scatter.apply(input, dim)
+
+
+class Scatter(torch.autograd.Function):
+    """Autograd function: ``_split`` in forward, ``_gather`` in backward."""
+
+    @staticmethod
+    def forward(ctx: "Scatter", input: Tensor, dim: int = -1) -> Tensor:
+        """Return this rank's slice of ``input`` along ``dim``."""
+        # ``dim`` is a plain Python int, so keep it as a ctx attribute rather
+        # than wrapping it in a tensor for save_for_backward, which is meant
+        # for tensors that take part in the autograd graph.
+        ctx.dim = dim
+        return _split(input, dim=dim)
+
+    @staticmethod
+    def backward(ctx: "Scatter", grad_output: Tensor) -> Tuple[Tensor, None]:
+        """Gather the gradient along the forward ``dim``.
+
+        Returns one entry per forward argument; ``dim`` receives ``None``.
+        """
+        return _gather(grad_output, dim=ctx.dim), None
+
+
+def reduce(input: Tensor) -> Tensor:
+    """Apply ``_reduce`` to ``input``.
+
+    Goes through ``Reduce.apply`` when gradients are enabled and ``input``
+    requires grad; otherwise calls ``_reduce`` directly.
+    """
+    if not (torch.is_grad_enabled() and input.requires_grad):
+        return _reduce(input)
+    return Reduce.apply(input)
+
+
+class Reduce(torch.autograd.Function):
+    """Autograd function: ``_reduce`` in forward, identity in backward."""
+
+    @staticmethod
+    def forward(ctx: "Reduce", input: Tensor) -> Tensor:
+        """Return ``input`` after passing it through ``_reduce``."""
+        reduced = _reduce(input)
+        return reduced
+
+    @staticmethod
+    def backward(ctx: "Reduce", grad_output: Tensor) -> Tensor:
+        """Return ``grad_output`` unchanged."""
+        return grad_output
+
+
+def gather(input: Tensor, dim: int = -1) -> Tensor:
+    """Gather ``input`` from the tensor-parallel ranks along ``dim``.
+
+    Goes through ``Gather.apply`` when gradients are enabled and ``input``
+    requires grad; otherwise calls ``_gather`` directly.
+    """
+    if not (torch.is_grad_enabled() and input.requires_grad):
+        return _gather(input, dim=dim)
+    return Gather.apply(input, dim)
+
+
+class Gather(torch.autograd.Function):
+    """Autograd function: ``_gather`` in forward, ``_split`` in backward."""
+
+    @staticmethod
+    def forward(ctx: "Gather", input: Tensor, dim: int = -1) -> Tensor:
+        """Return ``input`` gathered along ``dim``."""
+        # ``dim`` is a plain Python int, so keep it as a ctx attribute rather
+        # than wrapping it in a tensor for save_for_backward, which is meant
+        # for tensors that take part in the autograd graph.
+        ctx.dim = dim
+        return _gather(input, dim=dim)
+
+    @staticmethod
+    def backward(ctx: "Gather", grad_output: Tensor) -> Tuple[Tensor, None]:
+        """Split the gradient along the forward ``dim``.
+
+        Returns one entry per forward argument; ``dim`` receives ``None``.
+        """
+        return _split(grad_output, dim=ctx.dim), None
+
+
+def _all_to_all(tensor: Tensor, in_dim: int = -1, out_dim: int = -1) -> Tensor:
+    """Exchange slices of ``tensor`` between ranks via ``all_to_all_single``.
+
+    ``in_dim`` is moved to the front for the exchange and moved back
+    afterwards; the result is then cut into one chunk per tensor-parallel
+    rank along ``in_dim`` and the chunks are concatenated along ``out_dim``.
+    When the default process group has a single rank the input is returned
+    unchanged.
+    """
+    if dist.get_world_size() == 1:
+        return tensor
+
+    # Bring in_dim to position 0 for the exchange; all_to_all_single is
+    # called without explicit split sizes.
+    send_buf = tensor.transpose(in_dim, 0).contiguous()
+    recv_buf = torch.empty_like(send_buf)
+    dist.all_to_all_single(recv_buf, send_buf, group=get_tensor_model_parallel_group())
+
+    # Restore the original axis order, then move the per-rank pieces from
+    # in_dim over to out_dim.
+    exchanged = recv_buf.transpose(in_dim, 0).contiguous()
+    pieces = exchanged.chunk(get_tensor_model_parallel_world_size(), dim=in_dim)
+    return torch.cat(pieces, dim=out_dim)
+
+
+def col_to_row(input_: Tensor) -> Tensor:
+    """Run the all-to-all exchange with ``in_dim=1`` and ``out_dim=2``.
+
+    Goes through ``All_to_All.apply`` when gradients are enabled and
+    ``input_`` requires grad; otherwise calls ``_all_to_all`` directly.
+    """
+    if not (torch.is_grad_enabled() and input_.requires_grad):
+        return _all_to_all(input_, in_dim=1, out_dim=2)
+    return All_to_All.apply(input_, 1, 2)
+
+
+def row_to_col(input_: Tensor) -> Tensor:
+    """Run the all-to-all exchange with ``in_dim=2`` and ``out_dim=1``.
+
+    Goes through ``All_to_All.apply`` when gradients are enabled and
+    ``input_`` requires grad; otherwise calls ``_all_to_all`` directly.
+    """
+    if not (torch.is_grad_enabled() and input_.requires_grad):
+        return _all_to_all(input_, in_dim=2, out_dim=1)
+    return All_to_All.apply(input_, 2, 1)
+
+
+class All_to_All(torch.autograd.Function):
+    """Autograd function wrapping ``_all_to_all``.
+
+    The backward pass runs the same exchange with ``in_dim`` and ``out_dim``
+    swapped.
+    """
+
+    @staticmethod
+    def forward(ctx: "All_to_All", input_: Tensor, in_dim: int = -1, out_dim: int = -1) -> Tensor:
+        """Return ``_all_to_all(input_, in_dim, out_dim)``."""
+        # The dims are plain Python ints, so keep them as ctx attributes
+        # rather than packing them into a tensor for save_for_backward, which
+        # is meant for tensors that take part in the autograd graph.
+        ctx.in_dim = in_dim
+        ctx.out_dim = out_dim
+        return _all_to_all(input_, in_dim=in_dim, out_dim=out_dim)
+
+    @staticmethod
+    def backward(ctx: "All_to_All", grad_output: Tensor) -> Tuple[Tensor, None, None]:
+        """Exchange the gradient with the forward dims swapped.
+
+        Returns one entry per forward argument; the two dim arguments
+        receive ``None``.
+        """
+        grad_input = _all_to_all(grad_output, in_dim=ctx.out_dim, out_dim=ctx.in_dim)
+        return grad_input, None, None