
Commit 049d219

[Provisioner] New provisioner for GCP TPU VM (#2898)
* init
* test
* test ins_type
* fix
* format..
* wip
* remove TPU config
* fix node ips
* Fix TPU VM pod
* format
* use TPU VM as default
* Fix example for TPU VM
* format
* fix optimizer random dag
* set TPU-VM
* accelerator_args False
* backward compatibility
* add tpu filter for tests
* fix
* Fix
* fix status refresh for tpu VM pod
* Support autodown for TPU VM pod
* Allow multi-node TPU VM pod
* Allow multi-node TPU VM pod
* fix
* add execute for operation
* avoid from
* Wait for pending before set_labels
* format
* refactor constants
* Fix for API changes
* remove GCP failover handler v1
* format
* remove TPU VM pod specific codes as they have been moved to new provisioner
* Add error handling for TPU pod case
* fix
* fix multiple node calculation
* refactor tpu_utils to gcp_utils
* shorter time for recovering
* format

---------

Co-authored-by: Wei-Lin Chiang <[email protected]>
1 parent 27a295d commit 049d219

16 files changed: +446, -636 lines
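The commit makes TPU VM the default TPU deployment on GCP and moves its provisioning into the new SkyPilot provisioner. For context, below is a minimal sketch of requesting a TPU VM through SkyPilot's Python API; the accelerator name, run command, and cluster name are illustrative and not part of this commit.

# Minimal sketch (illustrative only): requesting a GCP TPU VM via SkyPilot's
# Python API, which the new provisioner now handles end to end.
import sky

task = sky.Task(run='python -c "import jax; print(jax.devices())"')
task.set_resources(
    sky.Resources(
        cloud=sky.GCP(),
        # TPU accelerator name; per this commit, TPU VM is the default
        # TPU deployment, so no extra accelerator_args are needed here.
        accelerators='tpu-v2-8',
    ))
sky.launch(task, cluster_name='my-tpu-vm')  # cluster name is illustrative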

sky/backends/backend_utils.py (+19, -106)
@@ -1,7 +1,6 @@
 """Util constants/functions for the backends."""
 from datetime import datetime
 import enum
-import json
 import os
 import pathlib
 import pprint
@@ -40,9 +39,9 @@
 from sky import status_lib
 from sky.backends import onprem_utils
 from sky.clouds import cloud_registry
+from sky.clouds.utils import gcp_utils
 from sky.provision import instance_setup
 from sky.skylet import constants
-from sky.skylet import log_lib
 from sky.usage import usage_lib
 from sky.utils import cluster_yaml_utils
 from sky.utils import command_runner
@@ -52,7 +51,6 @@
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import timeline
-from sky.utils import tpu_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
@@ -1107,7 +1105,7 @@ def write_cluster_config(
     usage_lib.messages.usage.update_ray_yaml(yaml_path)
 
     # For TPU nodes. TPU VMs do not need TPU_NAME.
-    if tpu_utils.is_tpu(to_provision) and not tpu_utils.is_tpu_vm(to_provision):
+    if gcp_utils.is_tpu(to_provision) and not gcp_utils.is_tpu_vm(to_provision):
         tpu_name = resources_vars.get('tpu_name')
         if tpu_name is None:
             tpu_name = cluster_name
@@ -1528,19 +1526,22 @@ def _query_head_ip_with_retries(cluster_yaml: str,
 
 
 @timeline.event
-def get_node_ips(cluster_yaml: str,
-                 expected_num_nodes: int,
-                 handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
-                 head_ip_max_attempts: int = 1,
-                 worker_ip_max_attempts: int = 1,
-                 get_internal_ips: bool = False) -> List[str]:
+def get_node_ips(
+        cluster_yaml: str,
+        expected_num_nodes: int,
+        # TODO: remove this argument once we remove the legacy on-prem
+        # support.
+        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+        head_ip_max_attempts: int = 1,
+        worker_ip_max_attempts: int = 1,
+        get_internal_ips: bool = False) -> List[str]:
     """Returns the IPs of all nodes in the cluster, with head node at front.
 
     Args:
         cluster_yaml: Path to the cluster yaml.
         expected_num_nodes: Expected number of nodes in the cluster.
-        handle: Cloud VM Ray resource handle. It is only required for TPU VM or
-            on-prem clusters.
+        handle: Cloud VM Ray resource handle. It is only required for on-prem
+            clusters.
         head_ip_max_attempts: Max attempts to get head ip.
         worker_ip_max_attempts: Max attempts to get worker ips.
         get_internal_ips: Whether to get internal IPs. When False, it is still
@@ -1551,18 +1552,13 @@ def get_node_ips(cluster_yaml: str,
         exceptions.FetchIPError: if we failed to get the IPs. e.reason is
             HEAD or WORKER.
     """
-    # When ray up launches TPU VM Pod, Pod workers (except for the head)
-    # won't be connected to Ray cluster. Thus "ray get-worker-ips"
-    # won't work and we need to query the node IPs with gcloud as
-    # implmented in _get_tpu_vm_pod_ips.
     ray_config = common_utils.read_yaml(cluster_yaml)
     # Use the new provisioner for AWS.
     provider_name = cluster_yaml_utils.get_provider_name(ray_config)
     cloud = cloud_registry.CLOUD_REGISTRY.from_str(provider_name)
     assert cloud is not None, provider_name
 
-    if (cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT and
-            not tpu_utils.is_tpu_vm(handle.launched_resources)):
+    if cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
         metadata = provision_lib.get_cluster_info(
             provider_name, ray_config['provider']['region'],
             ray_config['cluster_name'], ray_config['provider'])
@@ -1571,20 +1567,6 @@ def get_node_ips(cluster_yaml: str,
             raise exceptions.FetchIPError(exceptions.FetchIPError.Reason.HEAD)
         return metadata.get_feasible_ips(get_internal_ips)
 
-    use_tpu_vm = ray_config['provider'].get('_has_tpus', False)
-    if use_tpu_vm:
-        assert expected_num_nodes == 1, (
-            'TPU VM only supports single node for now.')
-        assert handle is not None, 'handle is required for TPU VM.'
-        try:
-            ips = _get_tpu_vm_pod_ips(ray_config, get_internal_ips)
-        except exceptions.CommandError as e:
-            raise exceptions.FetchIPError(
-                exceptions.FetchIPError.Reason.HEAD) from e
-        if len(ips) != tpu_utils.get_num_tpu_devices(handle.launched_resources):
-            raise exceptions.FetchIPError(exceptions.FetchIPError.Reason.HEAD)
-        return ips
-
     if get_internal_ips:
         with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
             ray_config['provider']['use_internal_ips'] = True
@@ -1658,74 +1640,6 @@ def get_node_ips(cluster_yaml: str,
     return head_ip_list + worker_ips
 
 
-@timeline.event
-def _get_tpu_vm_pod_ips(ray_config: Dict[str, Any],
-                        get_internal_ips: bool = False) -> List[str]:
-    """Returns the IPs of all TPU VM Pod workers using gcloud."""
-
-    cluster_name = ray_config['cluster_name']
-    zone = ray_config['provider']['availability_zone']
-    query_cmd = (f'gcloud compute tpus tpu-vm list --filter='
-                 f'"(labels.ray-cluster-name={cluster_name})" '
-                 f'--zone={zone} --format="value(name)"')
-    returncode, stdout, stderr = log_lib.run_with_log(query_cmd,
-                                                      '/dev/null',
-                                                      shell=True,
-                                                      stream_logs=False,
-                                                      require_outputs=True)
-    subprocess_utils.handle_returncode(
-        returncode,
-        query_cmd,
-        'Failed to run gcloud to get TPU VM IDs.',
-        stderr=stdout + stderr)
-    if len(stdout) == 0:
-        logger.debug('No TPU VMs found with cluster name '
-                     f'{cluster_name} in zone {zone}.')
-    if len(stdout.splitlines()) > 1:
-        # Rare case, this could mean resource leakage. Hint user.
-        logger.warning('Found more than one TPU VM/Pod with the same cluster '
-                       f'name {cluster_name} in zone {zone}.')
-
-    all_ips = []
-    for tpu_id in stdout.splitlines():
-        tpuvm_cmd = (f'gcloud compute tpus tpu-vm describe {tpu_id}'
-                     f' --zone {zone} --format=json')
-        returncode, stdout, stderr = log_lib.run_with_log(tpuvm_cmd,
-                                                          os.devnull,
-                                                          shell=True,
-                                                          stream_logs=False,
-                                                          require_outputs=True)
-        subprocess_utils.handle_returncode(
-            returncode,
-            tpuvm_cmd,
-            'Failed to run gcloud tpu-vm describe.',
-            stderr=stdout + stderr)
-
-        tpuvm_json = json.loads(stdout)
-        if tpuvm_json['state'] != 'READY':
-            # May be a leaked preempted resource, or terminated by user in the
-            # console, or still in the process of being created.
-            ux_utils.console_newline()
-            logger.debug(f'TPU VM {tpu_id} is in {tpuvm_json["state"]} '
-                         'state. Skipping IP query... '
-                         'Hint: make sure it is not leaked.')
-            continue
-
-        ips = []
-        for endpoint in tpuvm_json['networkEndpoints']:
-            # Note: if TPU VM is being preempted, its IP field may not exist.
-            # We use get() to avoid KeyError.
-            if get_internal_ips:
-                ip = endpoint.get('ipAddress', None)
-            else:
-                ip = endpoint['accessConfig'].get('externalIp', None)
-            if ip is not None:
-                ips.append(ip)
-        all_ips.extend(ips)
-
-    return all_ips
-
-
 def check_network_connection():
     # Tolerate 3 retries as it is observed that connections can fail.
     adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
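With _get_tpu_vm_pod_ips removed, TPU VM IP resolution flows through the provisioner path shown earlier in this diff (provision_lib.get_cluster_info followed by metadata.get_feasible_ips). A rough sketch of that flow, with hypothetical glue code and assumed import aliases:

# Sketch only: approximates the provisioner-based IP lookup used above,
# replacing the per-cloud `gcloud compute tpus tpu-vm describe` calls.
from typing import List

from sky import provision as provision_lib  # assumed alias, as in the diff
from sky.utils import cluster_yaml_utils
from sky.utils import common_utils


def node_ips_via_provisioner(cluster_yaml: str,
                             get_internal_ips: bool = False) -> List[str]:
    """Resolve node IPs from provisioner metadata (hypothetical helper)."""
    ray_config = common_utils.read_yaml(cluster_yaml)
    provider_name = cluster_yaml_utils.get_provider_name(ray_config)
    metadata = provision_lib.get_cluster_info(
        provider_name, ray_config['provider']['region'],
        ray_config['cluster_name'], ray_config['provider'])
    # Same accessor as in the diff: head IP first, internal/external as asked.
    return metadata.get_feasible_ips(get_internal_ips)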
@@ -2014,9 +1928,8 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
             # in the worst case we time out in the `ray status` SSH command
             # below.
             external_ips = handle.cached_external_ips
-            # This happens to a stopped TPU VM as we use gcloud to query the IP.
-            # Or user interrupt the `sky launch` process before the first time
-            # resources handle is written back to local database.
+            # This happens when user interrupt the `sky launch` process before
+            # the first time resources handle is written back to local database.
             # This is helpful when user interrupt after the provision is done
             # and before the skylet is restarted. After #2304 is merged, this
             # helps keep the cluster status to INIT after `sky status -r`, so
@@ -2054,13 +1967,13 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
                 f'-- stdout --\n{output}\n-- stderr --\n{stderr}')
 
             ready_head, ready_workers = _count_healthy_nodes_from_ray(output)
-
-            if ready_head + ready_workers == handle.launched_nodes:
+            total_nodes = handle.launched_nodes * handle.num_node_ips
+            if ready_head + ready_workers == total_nodes:
                 return True
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
-                f'{handle.launched_nodes}); output: {output}; stderr: {stderr}')
+                f'{total_nodes}); output: {output}; stderr: {stderr}')
         except exceptions.FetchIPError:
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
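The new health check accounts for multi-host TPU VM pods: a pod is recorded as one launched node but contributes one Ray node per TPU host, so the expected total is launched_nodes * num_node_ips (for ordinary single-IP nodes the product is unchanged). Illustrative numbers only:

# Illustrative only: a multi-host TPU VM pod slice with 4 hosts is one
# SkyPilot node (launched_nodes == 1) but four Ray nodes (num_node_ips == 4).
launched_nodes = 1
num_node_ips = 4
total_nodes = launched_nodes * num_node_ips  # == 4, matching `ray status`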
