Use CPU model as input and move it to CUDA inside the AOTI backend

Gasoonjia · Gasoonjia · commit 78a4e53a9ef8 · 2025-09-10T00:22:42.000-07:00
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
@@ -21,6 +21,7 @@
 )
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu
+from torch.export.passes import move_to_device_pass
 
 
 # exist fallback operators in et namespace;
@@ -71,14 +72,33 @@ def preprocess(
         edge_program: ExportedProgram,
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
+
         print("entering  the lowerable parts in AotiBackend.preprocess....")
         named_data_store = NamedDataStore()
 
         # print("here", edge_program.example_inputs)
         copy_edge_program = copy.deepcopy(edge_program)
+
+        # Move the edge_program from CPU to CUDA using move_to_device_pass
+        copy_edge_program = move_to_device_pass(copy_edge_program, "cuda")
         # graph_module = copy_edge_program.graph_module
         edge_program_module = copy_edge_program.module()
         args, kwargs = copy_edge_program.example_inputs
+
+        # Deep copy args and move tensors to CUDA for aot_compile
+        def move_to_cuda(obj):
+            """Move tensors in obj to CUDA, recursing into list/tuple/dict."""
+            if isinstance(obj, torch.Tensor):
+                return obj.cuda()
+            if isinstance(obj, (list, tuple)):
+                return type(obj)(move_to_cuda(element) for element in obj)
+            if isinstance(obj, dict):
+                return {name: move_to_cuda(entry) for name, entry in obj.items()}
+            return obj
+
+        args = move_to_cuda(copy.deepcopy(args))
+        kwargs = move_to_cuda(copy.deepcopy(kwargs))
+
         # print("args, kwargs", args, kwargs)
         print("len(args)", len(args))
         print("args[0].shape", args[0].shape)
diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
@@ -83,7 +83,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
     std::string so_blob_key = "so_blob";
 
     Result<FreeableBuffer> aoti_cuda_buffer =
-        named_data_map->get_data(aoti_cuda_blob_name.c_str());
+        named_data_map->get_data(so_blob_key.c_str());
 
     // Create a temporary file
     std::ofstream outfile(so_path.c_str(), std::ios::binary);
diff --git a/exir/program/_program.py b/exir/program/_program.py
@@ -1680,7 +1680,7 @@ def exported_program_to_device(exported_program, device):
 
         execution_programs: Dict[str, ExportedProgram] = {}
         for name, program in self._edge_programs.items():
-            program = exported_program_to_device(program, "cpu")
+            # program = exported_program_to_device(program, "cpu")
             if config.do_quant_fusion_and_const_prop:
                 if program.graph_signature.backward_signature is not None:
                     raise Exception(
diff --git a/export_aoti.py b/export_aoti.py
@@ -264,73 +264,61 @@ def forward(self, x):
     "mv2": {
         "model_class": MV2,
         "input_shapes": [(1, 3, 224, 224)],
-        "device": "cuda",
         "description": "MobileNetV2 model",
     },
     "resnet18": {
         "model_class": ResNet18,
         "input_shapes": [(1, 3, 224, 224)],
-        "device": "cuda",
         "description": "ResNet18 model",
     },
     "linear": {
         "model_class": Linear,
         "input_shapes": [(127, 7)],
-        "device": "cuda",
         "description": "Simple linear layer model",
     },
     "conv2d": {
         "model_class": SingleConv2d,
         "input_shapes": [(4, 3, 8, 8)],
-        "device": "cuda",
         "description": "Single Conv2d layer model",
     },
     "depthwise_conv": {
         "model_class": DepthwiseConv,
         "input_shapes": [(1, 32, 112, 112)],
-        "device": "cuda",
         "description": "Single Depthwise Conv2d layer model",
     },
     "add": {
         "model_class": Add,
         "input_shapes": [(10,), (10,)],
-        "device": "cuda",
         "description": "Simple tensor addition model",
     },
     "batchnorm": {
         "model_class": BatchNorm,
         "input_shapes": [(1, 16, 32, 32)],
-        "device": "cuda",
         "description": "Single BatchNorm2d layer model",
     },
     "single_resnet_block": {
         "model_class": SingleResNetBlock,
         "input_shapes": [(1, 64, 8, 8)],
-        "device": "cuda",
         "description": "Single ResNet block with skip connection",
     },
     "llama31": {
         "model_class": Llama31,
         "input_shapes": [(1, 32)],  # batch_size=1, sequence_length=128
-        "device": "cuda",
         "description": "Llama 3.1 model with KV cache disabled",
     },
     "whisper": {
         "model_class": Whisper,
         "input_shapes": [(1, 80, 3000)],
-        "device": "cuda",
         "description": "OpenAI Whisper ASR model. now is encoder only",
     },
     "conv1d": {
         "model_class": MockConv1d,
         "input_shapes": [(1, 80, 3000)],
-        "device": "cuda",
         "description": "Conv1d layer with 80 input channels, 384 output channels",
     },
     "transformer_block": {
         "model_class": TransformerBlock,
         "input_shapes": [(4, 32, 256)],  # batch_size=4, seq_len=32, embed_dim=256
-        "device": "cuda",
         "description": "Single transformer block with multi-head attention and feed-forward network",
     },
 }
@@ -350,7 +338,7 @@ def get_model_and_inputs(
     model_config = MODEL_REGISTRY[model_name]
     model_class = model_config["model_class"]
     input_shapes = model_config["input_shapes"]
-    device = model_config["device"]
+    device = "cpu"
 
     # Create model instance
     model = model_class().to(device).eval()