From dec3619288ad7716da9ed5c1dcb1aa41c53c6b05 Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Sun, 27 Apr 2025 07:51:09 +0000
Subject: [PATCH 1/3] feat: k8s gateway

---
 sky/lbbench/gen_cmd.py       | 75 +++++++++++++++++++++++++++++-------
 sky/lbbench/queue_fetcher.py | 53 +++++++++++++++++++++++--
 2 files changed, 111 insertions(+), 17 deletions(-)

diff --git a/sky/lbbench/gen_cmd.py b/sky/lbbench/gen_cmd.py
index 4e0760ebf6b..d99f54cc956 100644
--- a/sky/lbbench/gen_cmd.py
+++ b/sky/lbbench/gen_cmd.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 import shlex
 import tempfile
-from typing import List
+from typing import List, Optional
 
 from sky.lbbench import utils
 
@@ -17,6 +17,7 @@
     'sky_push_pull',
     'sky_push_push',
     'sky_pull_pull_rate_limit',
+    'gke_gateway',
 ]
 presents = [
     'Baseline',
@@ -25,19 +26,22 @@
     'Ours\\n[Push+Pull]',
     'Ours\\n[Push+Push]',
     'Ours\\n[Pull/RateLimit+Pull]',
+    'GKE Gateway',
 ]
 
-enabled_systems = [
+# Full list of systems indices - will be filtered by --run-systems
+all_systems = [
     0,  # sgl router
     1,  # sgl router enhanced
     2,  # sky pulling in lb, pulling in replica, but workload stealing
     3,  # sky pushing in lb, pulling in replica
     4,  # sky pushing in lb, pushing in replica
     5,  # sky pulling in lb, pulling in replica, but rate limit
+    6,  # gke
 ]
 
-describes = [describes[i] for i in enabled_systems]
-presents = [presents[i] for i in enabled_systems]
+# Default to just running GKE
+enabled_systems = [6]  # gke
 
 ct = None
 sn2st = None
@@ -53,7 +57,15 @@ def _get_head_ip_for_cluster(cluster: str) -> str:
     raise ValueError(f'Cluster {cluster} not found')
 
 
-def _get_endpoint_for_traffic(index: int, sns: List[str]) -> str:
+def _get_endpoint_for_traffic(index: int,
+                              sns: List[str],
+                              gke_endpoint: Optional[str] = None) -> str:
+    if index == 6:  # GKE Gateway
+        if gke_endpoint:
+            if not gke_endpoint.startswith(('http://', 'https://')):
+                return f'http://{gke_endpoint}'
+            return gke_endpoint
+        return 'http://34.117.239.237:80'  # Default GKE endpoint
     if index == 0:
         sgl_ip = _get_head_ip_for_cluster(utils.sgl_cluster)
         return f'{sgl_ip}:9001'
@@ -76,19 +88,55 @@ def _region_cluster_name(r: str) -> str:
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--service-names', type=str, nargs='+', required=True)
+    parser.add_argument(
+        '--service-names',
+        type=str,
+        nargs='*',
+        default=[],
+        help='Service names for SkyPilot services (indices 2-5)')
     parser.add_argument('--exp-name', type=str, required=True)
     parser.add_argument('--extra-args', type=str, default='')
     parser.add_argument('--output-dir', type=str, default='@temp')
     parser.add_argument('--regions', type=str, default=None, nargs='+')
     parser.add_argument('--region-to-args', type=str, default=None)
+    parser.add_argument('--gke-endpoint',
+                        type=str,
+                        default='34.117.239.237:80',
+                        help='GKE Gateway endpoint (IP:port)')
+    parser.add_argument(
+        '--run-systems',
+        type=int,
+        nargs='+',
+        default=[6],
+        help='Indices of systems to run (default: [6] for GKE only)')
     args = parser.parse_args()
-    sns = args.service_names
-    if len(sns) != len(describes):
-        raise ValueError(f'Expected {len(describes)} service names for '
-                         f'{", ".join(describes)}')
 
-    endpoints = [_get_endpoint_for_traffic(i, sns) for i in enabled_systems]
+    # Update enabled_systems based on --run-systems
+    global enabled_systems, describes, presents
+    enabled_systems = args.run_systems if args.run_systems else [
+        6
+    ]  # Default to GKE if not specified
+
+    # Filter describes and presents based on enabled_systems
+    describes = [describes[i] for i in enabled_systems]
+    presents = [presents[i] for i in enabled_systems]
+
+    # Only require service-names for SkyPilot systems (2-5)
+    sky_systems_count = sum(1 for s in enabled_systems if 2 <= s <= 5)
+    sns = args.service_names
+    if len(sns) != sky_systems_count:
+        if sky_systems_count > 0:
+            raise ValueError(
+                f'Expected {sky_systems_count} service names for SkyPilot')
+
+    # If no SkyPilot services needed, use empty list for non-sky systems
+    if sky_systems_count == 0:
+        sns = [''] * len(enabled_systems)
+
+    endpoints = [
+        _get_endpoint_for_traffic(i, sns, args.gke_endpoint)
+        for i in enabled_systems
+    ]
     print(endpoints)
     if any('None' in e for e in endpoints):
         raise ValueError('Some endpoints are not found')
@@ -212,8 +260,9 @@ def main():
         f.write('# Wait for queue status puller to initialize\n')
         f.write('echo "Waiting for queue status puller to initialize..."\n')
         f.write(f'echo "Check log file: tail -f {queue_status_file}"\n')
-        f.write('while ! grep -q "Pulling queue status" '
-                f'{queue_status_file}; do\n')
+        f.write(
+            'while ! grep -q "Pulling queue status\\|Skipping queue polling" '
+            f'{queue_status_file}; do\n')
         f.write('  sleep 1\n')
         f.write('  echo -n "."\n')
         f.write('done\n')
diff --git a/sky/lbbench/queue_fetcher.py b/sky/lbbench/queue_fetcher.py
index 32d3f43e4e2..7d95b30e4ff 100644
--- a/sky/lbbench/queue_fetcher.py
+++ b/sky/lbbench/queue_fetcher.py
@@ -5,6 +5,7 @@
 import json
 import os
 from pathlib import Path
+import shutil
 import tempfile
 import time
 from typing import Dict, List
@@ -18,12 +19,30 @@
 
 def prepare_lb_endpoints_and_confirm(exp2backend: Dict[str, str],
                                      yes: bool) -> Dict[str, List[str]]:
-    req = sky.serve.status(None)
-    st = sky.client.sdk.get(req)
-    print(sky.serve.format_service_table(st, show_all=False))
+    # Only fetch Serve status if there are SkyPilot Serve endpoints
+    st = []
+    needs_serve_status = any(
+        'aws.cblmemo.net' in url for url in exp2backend.values())
+
+    if needs_serve_status:
+        try:
+            req = sky.serve.status(None)
+            st = sky.client.sdk.get(req)
+            print(sky.serve.format_service_table(st, show_all=False))
+        except Exception as e:  # pylint: disable=broad-except
+            print(f'Warning: Could not fetch SkyPilot Serve status: {e}')
+            st = []
+    else:
+        print('No SkyPilot Serve endpoints found; skipping `sky serve status` '
+              'call.')
 
     def _get_one_endpoints(backend_url: str) -> List[str]:
         if 'aws.cblmemo.net' in backend_url:
+            if not st:
+                print(f'Warning: No SkyPilot Serve status available for '
+                      f'{backend_url}')
+                return []
+
             service_name = backend_url.split('.')[0]
             st4svc = None
             for svc in st:
@@ -40,6 +59,11 @@ def _get_one_endpoints(backend_url: str) -> List[str]:
             endpoints = [url]
         elif '9001' in backend_url:  # SGLang Router
             endpoints = [utils.sgl_cluster]
+        elif '34.117.239.237' in backend_url:  # GKE endpoint
+            url = backend_url
+            if not url.startswith('http://'):
+                url = 'http://' + url
+            endpoints = [url]
         else:
             return []
             # raise ValueError(f'Unknown backend URL: {backend_url}')
@@ -58,11 +82,32 @@ def _get_one_endpoints(backend_url: str) -> List[str]:
 
 async def pull_queue_status(exp_name: str, endpoints: List[str],
                             event: asyncio.Event, output_dir: str) -> None:
+    # Check if this is a GKE endpoint (34.117.239.237)
+    is_gke_baseline = False
+    if endpoints and any(
+            '34.117.239.237' in endpoint for endpoint in endpoints):
+        is_gke_baseline = True
+
     tmp_name = os.path.join(tempfile.gettempdir(),
                             f'result_queue_size_{exp_name}.txt')
     dest_name = (Path(output_dir).expanduser() / 'queue_size' /
                  f'{exp_name}.txt')
     dest_name.parent.mkdir(parents=True, exist_ok=True)
+
+    # If this is GKE, skip queue polling
+    if is_gke_baseline:
+        print(f'Skipping queue polling for GKE baseline: {exp_name}',
+              flush=True)
+        with open(dest_name, 'w', encoding='utf-8') as f:
+            f.write(
+                json.dumps({
+                    'time': time.time(),
+                    'status': 'skipped'
+                }) + '\n')
+        await event.wait()  # Wait for stop signal
+        print(f'Queue fetcher finished (skipped) for GKE: {exp_name}')
+        return
+
     # Force flush it to make the tee works
     print(f'Pulling queue status:      tail -f {tmp_name} | jq', flush=True)
     if utils.sgl_cluster in endpoints:
@@ -86,7 +131,7 @@ async def pull_queue_status(exp_name: str, endpoints: List[str],
                         lb2confs[endpoint] = conf
                     print(json.dumps(lb2confs), file=f, flush=True)
                     await asyncio.sleep(1)
-        os.rename(tmp_name, dest_name)
+        shutil.move(tmp_name, dest_name)
 
 
 def main():

From 046758e585f737432a64981d1a6c7c4cf2ad3a03 Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Mon, 28 Apr 2025 16:03:11 +0000
Subject: [PATCH 2/3] docs: readme

---
 sky/lbbench/README.md  | 238 +++++++++++++++++++++++++++++++++--------
 sky/lbbench/gen_cmd.py |  10 +-
 2 files changed, 198 insertions(+), 50 deletions(-)

diff --git a/sky/lbbench/README.md b/sky/lbbench/README.md
index 5f6bab226ac..8927135ee0d 100644
--- a/sky/lbbench/README.md
+++ b/sky/lbbench/README.md
@@ -21,6 +21,8 @@ rm -rf ~/.sky/.wheels_lock ~/.sky/wheels
 sky down sky-serve-controller-<user-hash>
 ```
 
+### A. Preparation for SkyPilot Services
+
 Cloning the plot script to the correct path:
 
 ```bash
@@ -40,13 +42,57 @@ export HF_TOKEN=<your-huggingface-token>
 
 This token should have access to `meta-llama/Llama-3.1-8B-Instruct` and `lmsys/chatbot_arena_conversations`.
 
+### B. GKE Environment Setup
+
+To use the GKE Multi-Cluster Gateway baseline, you need:
+
+1. `gcloud` and `kubectl` CLI tools with proper GCP project access
+2. Two GKE clusters in different regions, VPC-native, part of a Fleet, with Workload Identity enabled
+
+**Note:** The following steps assume GKE clusters named `sglang-us` and `sglang-asia` already exist and meet the requirements. If you need to create new clusters, use `gcloud container clusters create` with appropriate parameters including Fleet registration, Workload Identity enablement, VPC-native networking, and Gateway API enablement.
+
+Example clusters setup:
+- `sglang-us` in `us-central1` 
+- `sglang-asia` in `asia-northeast1`
+
+Set kubectl context environment variables (these will only be valid for your current shell session):
+```bash
+# Find your project ID if you don't know it
+gcloud config get-value project
+# List available contexts
+kubectl config get-contexts
+
+# Set context variables with your project ID
+export US_CONTEXT=gke_<your-project-id>_us-central1_sglang-us
+export ASIA_CONTEXT=gke_<your-project-id>_asia-northeast1_sglang-asia
+```
+
+Enable required GCP APIs:
+```bash
+# Core APIs (may already be enabled)
+gcloud services enable container.googleapis.com
+gcloud services enable compute.googleapis.com
+gcloud services enable gkehub.googleapis.com
+
+# Multi-cluster gateway APIs
+gcloud services enable multiclusterservicediscovery.googleapis.com
+gcloud services enable multiclusteringress.googleapis.com
+gcloud services enable trafficdirector.googleapis.com
+```
+
+Enable Gateway API on both clusters:
+```bash
+gcloud container clusters update sglang-us --location=us-central1 --gateway-api=standard
+gcloud container clusters update sglang-asia --location=asia-northeast1 --gateway-api=standard
+```
+
 ## Step 1: Launch Services
 
 Adjusting the service YAML (`examples/serve/external-lb/llm.yaml`) based on desired replica configuration. The default is 2 replicas in `us-east-2` and 2 replicas in `ap-northeast-1`. **All replicas will be launched in a round-robin fashion in the `ordered` region list**. e.g. if there is 3 regions and 4 replicas, the first region in the list will have 2 replicas and the other two regions will have 1 replica each. **All replicas should use AWS cloud for now**.
 
 When adding replicas to other regions, make sure to update the `external_load_balancers` section to add one load balancer for the new region. **All load balancers should use AWS cloud**. The `route53_hosted_zone` should be configured in the given credentials and no changes is needed - if you need to add a new one, please contact the author.
 
-Running the following command for 4 times.
+Run the following command 6 times, once for each of the services below.
 
 - `svc1`: SGLang Router
 - `svc2`: SGLang Router [Pull]
@@ -70,7 +116,7 @@ sky serve up examples/serve/external-lb/llm.yaml -y -n svc6 --env HF_TOKEN --env
 Here is a easy-to-use script:
 
 ```bash
-PREFIX="svc" sky/lbbench/launch_systems.sh
+PREFIX="svc" bash sky/lbbench/launch_systems.sh
 ```
 
 Keep running `sky serve status -v` until all of them are ready (all replicas are ready):
@@ -78,52 +124,69 @@ Keep running `sky serve status -v` until all of them are ready (all replicas are
 ```bash
 $ sky serve status -v
 Services
-NAME  VERSION  UPTIME   STATUS  REPLICAS  EXTERNAL_LBS  ENDPOINT                   AUTOSCALING_POLICY  LOAD_BALANCING_POLICY  REQUESTED_RESOURCES  
-svc5  1        20m 12s  READY   4/4       2/2           svc5.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
-svc1  1        20m 40s  READY   4/4       2/2           svc1.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
-svc3  1        20m 20s  READY   4/4       2/2           svc3.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
-svc4  1        20m 36s  READY   4/4       2/2           svc4.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
-svc2  1        20m 45s  READY   4/4       2/2           svc2.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
+NAME  VERSION  UPTIME      STATUS  REPLICAS  EXTERNAL_LBS  ENDPOINT                   AUTOSCALING_POLICY  LOAD_BALANCING_POLICY  REQUESTED_RESOURCES  
+svc3  1        1h 40m 41s  READY   4/4       3/3           svc3.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
+svc6  1        1h 39m 1s   READY   4/4       3/3           svc6.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
+svc1  1        1h 41m 27s  READY   4/4       3/3           svc1.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
+svc5  1        1h 39m 26s  READY   4/4       3/3           svc5.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
+svc2  1        1h 41m 9s   READY   4/4       3/3           svc2.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
+svc4  1        1h 40m 8s   READY   4/4       3/3           svc4.aws.cblmemo.net:8000  Fixed 4 replicas    prefix_tree            1x[L4:1]             
 
 Service Replicas
-SERVICE_NAME  ID  VERSION  ENDPOINT                    LAUNCHED     RESOURCES                                                      STATUS  REGION          ZONE             
-svc5          1   1        http://43.207.115.174:8081  1 min ago    1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc5          2   1        http://18.119.111.119:8081  21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc5          3   1        http://18.183.57.17:8081    1 min ago    1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1c  
-svc5          4   1        http://3.15.150.242:8081    21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc1          1   1        http://54.178.80.208:8081   21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc1          2   1        http://3.137.168.179:8081   22 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc1          3   1        http://43.207.108.244:8081  21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc1          4   1        http://3.144.37.203:8081    22 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc3          1   1        http://18.183.179.168:8081  2 mins ago   1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc3          2   1        http://18.116.42.193:8081   21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc3          3   1        http://43.206.151.166:8081  3 mins ago   1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1c  
-svc3          4   1        http://18.218.48.79:8081    21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc4          1   1        http://18.181.198.138:8081  3 mins ago   1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc4          2   1        http://18.118.37.42:8081    21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc4          3   1        http://43.207.182.196:8081  3 mins ago   1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1c  
-svc4          4   1        http://3.137.166.123:8081   21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc2          1   1        http://18.179.9.124:8081    21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc2          2   1        http://3.128.31.100:8081    22 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
-svc2          3   1        http://52.193.151.185:8081  21 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
-svc2          4   1        http://3.15.229.172:8081    22 mins ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+SERVICE_NAME  ID  VERSION  ENDPOINT                    LAUNCHED  RESOURCES                                                      STATUS  REGION          ZONE             
+svc3          1   1        http://3.135.193.32:8081    1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc3          2   1        http://3.113.16.187:8081    1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
+svc3          3   1        http://63.177.255.149:8081  1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   eu-central-1    eu-central-1a    
+svc3          4   1        http://3.128.205.218:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc6          1   1        http://3.147.28.35:8081     1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc6          2   1        http://13.230.201.235:8081  1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
+svc6          3   1        http://63.178.20.3:8081     1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   eu-central-1    eu-central-1a    
+svc6          4   1        http://3.131.38.8:8081      1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc1          1   1        http://18.117.229.250:8081  1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc1          2   1        http://52.194.245.225:8081  1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
+svc1          3   1        http://35.159.32.20:8081    1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   eu-central-1    eu-central-1a    
+svc1          4   1        http://18.227.13.76:8081    1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc5          1   1        http://3.145.155.227:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc5          2   1        http://18.183.186.141:8081  1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
+svc5          3   1        http://3.70.232.144:8081    1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   eu-central-1    eu-central-1a    
+svc5          4   1        http://3.21.12.15:8081      1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc2          1   1        http://18.191.236.14:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc2          2   1        http://13.231.55.112:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
+svc2          3   1        http://52.28.146.141:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   eu-central-1    eu-central-1a    
+svc2          4   1        http://3.144.107.228:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc4          1   1        http://3.148.178.18:8081    1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
+svc4          2   1        http://3.112.194.194:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   ap-northeast-1  ap-northeast-1a  
+svc4          3   1        http://18.185.63.243:8081   1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   eu-central-1    eu-central-1a    
+svc4          4   1        http://3.16.82.200:8081     1 hr ago  1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081'])  READY   us-east-2       us-east-2a       
 
 External Load Balancers
-SERVICE_NAME  ID  VERSION  ENDPOINT                    LAUNCHED     RESOURCES                          STATUS  REGION          ZONE             
-svc5          1   1        http://18.191.178.226:8000  21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
-svc5          2   1        http://52.195.182.20:8000   21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
-svc1          1   1        http://18.216.134.208:8000  22 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
-svc1          2   1        http://13.231.55.224:8000   21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
-svc3          1   1        http://18.217.140.173:8000  21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
-svc3          2   1        http://13.231.3.176:8000    21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
-svc4          1   1        http://18.118.207.237:8000  21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
-svc4          2   1        http://52.194.190.7:8000    21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
-svc2          1   1        http://18.119.116.201:8000  22 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
-svc2          2   1        http://3.112.123.186:8000   21 mins ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a 
+SERVICE_NAME  ID  VERSION  ENDPOINT                    LAUNCHED  RESOURCES                          STATUS  REGION          ZONE             
+svc3          1   1        http://13.58.2.218:8000     1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
+svc3          2   1        http://57.180.243.88:8000   1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
+svc3          3   1        http://3.75.194.104:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   eu-central-1    eu-central-1a    
+svc6          1   1        http://3.138.114.90:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
+svc6          2   1        http://54.65.4.162:8000     1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
+svc6          3   1        http://35.158.96.98:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   eu-central-1    eu-central-1a    
+svc1          1   1        http://13.58.116.247:8000   1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
+svc1          2   1        http://57.180.246.222:8000  1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
+svc1          3   1        http://3.71.203.38:8000     1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   eu-central-1    eu-central-1a    
+svc5          1   1        http://18.216.140.1:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
+svc5          2   1        http://3.113.11.160:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
+svc5          3   1        http://18.156.69.69:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   eu-central-1    eu-central-1a    
+svc2          1   1        http://18.224.73.161:8000   1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
+svc2          2   1        http://54.178.200.168:8000  1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
+svc2          3   1        http://18.185.177.96:8000   1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   eu-central-1    eu-central-1a    
+svc4          1   1        http://3.149.0.145:8000     1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   us-east-2       us-east-2a       
+svc4          2   1        http://3.112.47.105:8000    1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   ap-northeast-1  ap-northeast-1a  
+svc4          3   1        http://18.199.100.176:8000  1 hr ago  1x AWS(m6i.large, ports=['8000'])  READY   eu-central-1    eu-central-1a
 ```
 
+Note that the GKE baseline system does not need a service for testing.
+
 ## Step 2: Launch baseline load balancers
 
+### A. Launch SGLang Router Baselines (Indices 0, 1)
+
 We compare the performance of our load balancer with the following baselines:
 
 - SGLang Router
@@ -153,25 +216,111 @@ $ sky logs sgl-router-pull
 
 Make sure each load balancer has the desired number of replicas.
 
+### B. Setup/Verify GKE Gateway Baseline (Index 6)
+
+**Note:** If you have already completed the initial GKE Gateway setup and K8s resources haven't changed, you typically only need to execute **Step 6 (Ensure Deployment is running)** and **Step 5 (Verify Gateway status and IP)** at the end of this section. No need to reapply YAML files.
+
+#### Step 1: Enable Fleet features
+
+```bash
+# Enable Multi-cluster Services
+gcloud container fleet multi-cluster-services enable --project=<your-project-id>
+
+# Enable Multi-cluster Ingress with US as config cluster
+gcloud container fleet ingress enable --config-membership=projects/<your-project-id>/locations/global/memberships/sglang-us --project=<your-project-id>
+
+# Add IAM policy binding
+gcloud projects add-iam-policy-binding <your-project-id> \
+  --member "serviceAccount:service-<your-project-number>@gcp-sa-multiclusteringress.iam.gserviceaccount.com" \
+  --role "roles/container.admin"
+```
+
+**Note:** If you encounter issues with the `-mc` GatewayClass not being created, you may need to disable and re-enable the ingress feature.
+
+#### Step 2: Deploy application to GKE
+
+Apply the deployment YAML to both clusters:
+
+```bash
+kubectl apply -f examples/serve/external-lb/k8s/all.yaml --context=$US_CONTEXT
+kubectl apply -f examples/serve/external-lb/k8s/all.yaml --context=$ASIA_CONTEXT
+```
+
+#### Step 3: Export Services
+
+Export services to make them available across clusters:
+
+```bash
+kubectl apply -f examples/serve/external-lb/k8s/svc-export.yaml --context=$US_CONTEXT
+kubectl apply -f examples/serve/external-lb/k8s/svc-export.yaml --context=$ASIA_CONTEXT
+```
+
+#### Step 4: Deploy GKE Gateway resources
+
+Deploy Gateway resources in the config cluster:
+
+```bash
+kubectl apply -f examples/serve/external-lb/k8s/gateway.yaml --context=$US_CONTEXT
+kubectl apply -f examples/serve/external-lb/k8s/httproute.yaml --context=$US_CONTEXT
+kubectl apply -f examples/serve/external-lb/k8s/healthcheckpolicy.yaml --context=$US_CONTEXT
+```
+
+#### Step 5: Verify Gateway status and get IP
+
+Check gateway status and wait for an IP address:
+
+```bash
+kubectl get gateway sglang-external-gateway -n default --context=$US_CONTEXT --watch
+```
+
+When the gateway shows READY=True and has an IP address, export it:
+
+```bash
+export GKE_GATEWAY_IP=$(kubectl get gateway sglang-external-gateway -n default --context=$US_CONTEXT -o jsonpath='{.status.addresses[0].value}')
+echo "Gateway IP: $GKE_GATEWAY_IP"
+```
+
+#### Step 6: Ensure GKE Deployment is running
+
+Scale the deployment to the desired number of replicas in both clusters:
+
+```bash
+kubectl scale deployment sglang-deployment --replicas=2 -n default --context=$US_CONTEXT
+kubectl scale deployment sglang-deployment --replicas=2 -n default --context=$ASIA_CONTEXT
+```
+
+Verify the pods are running:
+
+```bash
+kubectl get pods -n default --context=$US_CONTEXT
+kubectl get pods -n default --context=$ASIA_CONTEXT
+```
+
 ## Step 3: Generate Bash Scripts to Use
 
 We have a util script to generate the benchmark commands. This doc will only cover the usage of multi-region clients, which means the requests will be simultaneously sent from multiple regions.
 
-**Notice that the service names should be the same order as the ones used in Step 2**.
+**Important:** The `--service-names` parameter must provide service names corresponding to the **SkyPilot services** selected in `--run-systems`. The number of names must match exactly, and they must be in the correct order. For example, if `--run-systems` includes 2 and 4, you need to provide two service names in `--service-names`, the first for system 2 and the second for system 4. If `--run-systems` doesn't include any indices from 0-5 (e.g. only the GKE Gateway, index 6), then no service names are needed.
 
 Explanation of the arguments:
 
 - `--exp-name`: Identifier for the experiment. Please describe the experiment config in the name.
 - `--extra-args`: Workload specific arguments.
 - `--regions`: Client regions. This should be a list.
+- `--run-systems`: Indices of systems to run (0-6). Default is all systems.
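+- `--gke-endpoint`: Endpoint of the GKE Gateway in `IP:port` form, used when running system index 6. Defaults to `34.117.239.237:80`; pass your own gateway address (e.g. `"$GKE_GATEWAY_IP:80"`) if it differs.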
+- `--service-names`: Names of SkyPilot services corresponding to indices 0-5 in `--run-systems`.
 
 **Notice that the `--extra-args` will be applied to all regions**. If you want a total concurrency of 300, you should set `--num-users (300 / num-regions)` for each region.
 
+### Running All Systems (including GKE)
+
 ```bash
-python3 -m sky.lbbench.gen_cmd --service-names svc1 svc2 svc3 svc4 svc5 \
-  --exp-name arena_syn_mrc_tail_c2000_u300_d240 \
+python3 -m sky.lbbench.gen_cmd --service-names svc1 svc2 svc3 svc4 svc5 svc6 \
+  --exp-name arena_syn_mrc_tail_c2000_u150_d240 \
   --extra-args '--workload arena_syn --duration 240 --num-conv 2000 --num-users 150' \
-  --regions us-east-2 ap-northeast-1
+  --regions us-east-2 ap-northeast-1 \
+  --gke-endpoint "$GKE_GATEWAY_IP:80"
 ```
 
 ### Side Note: Support for different configurations in different regions
@@ -216,6 +365,7 @@ Final step is to plot the results. You should see the following output from the
     'arena_syn_mrc_100_50_tail_c2000_u150_d240_sky_pull_pull': 'Ours\n[Pull+Pull]',
     'arena_syn_mrc_100_50_tail_c2000_u150_d240_sky_push_pull': 'Ours\n[Push+Pull]',
     'arena_syn_mrc_100_50_tail_c2000_u150_d240_sky_push_push': 'Ours\n[Push+Push]',
+    'arena_syn_mrc_100_50_tail_c2000_u150_d240_gke': 'GKE\nGateway',
 ```
 
 Copy-pasting them into the `gn2alias` variable in the `@temp/result/plot.py` script and run it. **Make sure to comment out other parts in the variable**.
diff --git a/sky/lbbench/gen_cmd.py b/sky/lbbench/gen_cmd.py
index 74bd8a85a1b..333e99a162e 100644
--- a/sky/lbbench/gen_cmd.py
+++ b/sky/lbbench/gen_cmd.py
@@ -107,22 +107,20 @@ def main():
         '--run-systems',
         type=int,
         nargs='+',
-        default=[6],
-        help='Indices of systems to run (default: [6] for GKE only)')
+        default=all_systems,
+        help='Indices of systems to run (default: all)')
     args = parser.parse_args()
 
     # Update enabled_systems based on --run-systems
     global enabled_systems, describes, presents
-    enabled_systems = args.run_systems if args.run_systems else [
-        6
-    ]  # Default to GKE if not specified
+    enabled_systems = args.run_systems
 
     # Filter describes and presents based on enabled_systems
     describes = [describes[i] for i in enabled_systems]
     presents = [presents[i] for i in enabled_systems]
 
     # Only require service-names for SkyPilot systems (2-5)
-    sky_systems_count = sum(1 for s in enabled_systems if 2 <= s <= 5)
+    sky_systems_count = sum(1 for s in enabled_systems if 0 <= s <= 5)
     sns = args.service_names
     if len(sns) != sky_systems_count:
         if sky_systems_count > 0:

From 39f70424bb4138299d24f29d5c64506160a6ed27 Mon Sep 17 00:00:00 2001
From: Andy Lee <andylizf@outlook.com>
Date: Mon, 28 Apr 2025 23:25:03 +0000
Subject: [PATCH 3/3] docs: scale down

---
 sky/lbbench/README.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/sky/lbbench/README.md b/sky/lbbench/README.md
index 8927135ee0d..6f24fd77139 100644
--- a/sky/lbbench/README.md
+++ b/sky/lbbench/README.md
@@ -40,6 +40,16 @@ Prepare your Hugging Face Token:
 export HF_TOKEN=<your-huggingface-token>
 ```
 
+To ensure the SkyServe controller has sufficient capacity to manage multiple services and replicas (up to 16 concurrent services) and can properly communicate with the load balancer, configure your `~/.sky/config.yaml` as follows:
+
+```yaml
+serve:
+  controller:
+    resources:
+      cloud: aws
+      cpus: 16+
+```
+
 This token should have access to `meta-llama/Llama-3.1-8B-Instruct` and `lmsys/chatbot_arena_conversations`.
 
 ### B. GKE Environment Setup
@@ -282,6 +292,13 @@ echo "Gateway IP: $GKE_GATEWAY_IP"
 
 #### Step 6: Ensure GKE Deployment is running
 
+Scale up the node pools of both GKE clusters (2 nodes each in this example):
+
+```bash
+gcloud container clusters resize sglang-us --node-pool=default-pool --num-nodes=2 --region=us-central1 -q
+gcloud container clusters resize sglang-asia --node-pool=default-pool --num-nodes=2 --region=asia-northeast1 -q
+```
+
 Scale the deployment to the desired number of replicas in both clusters:
 
 ```bash
@@ -388,6 +405,14 @@ sky serve down -ay
 sky stop -ay
 # Or only cancel it if you want to keep using them in the next run
 sky cancel -ay sgl-router && sky cancel -ay sgl-router-pull
+
+kubectl scale deployment sglang-deployment --replicas=0 -n default --context=$US_CONTEXT
+kubectl scale deployment sglang-deployment --replicas=0 -n default --context=$ASIA_CONTEXT
+# Scale down GKE node pools completely
+# Note: This is critical as autoscaling alone may not fully release resources
+# due to system pods that keep nodes alive
+gcloud container clusters resize sglang-us --node-pool=default-pool --num-nodes=0 --region=us-central1 -q
+gcloud container clusters resize sglang-asia --node-pool=default-pool --num-nodes=0 --region=asia-northeast1 -q
 ```
 
 > The following content is stale. Don't need to check it.