From dec3619288ad7716da9ed5c1dcb1aa41c53c6b05 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Sun, 27 Apr 2025 07:51:09 +0000 Subject: [PATCH 1/3] feat: k8s gateway --- sky/lbbench/gen_cmd.py | 75 +++++++++++++++++++++++++++++------- sky/lbbench/queue_fetcher.py | 53 +++++++++++++++++++++++-- 2 files changed, 111 insertions(+), 17 deletions(-) diff --git a/sky/lbbench/gen_cmd.py b/sky/lbbench/gen_cmd.py index 4e0760ebf6b..d99f54cc956 100644 --- a/sky/lbbench/gen_cmd.py +++ b/sky/lbbench/gen_cmd.py @@ -6,7 +6,7 @@ from pathlib import Path import shlex import tempfile -from typing import List +from typing import List, Optional from sky.lbbench import utils @@ -17,6 +17,7 @@ 'sky_push_pull', 'sky_push_push', 'sky_pull_pull_rate_limit', + 'gke_gateway', ] presents = [ 'Baseline', @@ -25,19 +26,22 @@ 'Ours\\n[Push+Pull]', 'Ours\\n[Push+Push]', 'Ours\\n[Pull/RateLimit+Pull]', + 'GKE Gateway', ] -enabled_systems = [ +# Full list of systems indices - will be filtered by --run-systems +all_systems = [ 0, # sgl router 1, # sgl router enhanced 2, # sky pulling in lb, pulling in replica, but workload stealing 3, # sky pushing in lb, pulling in replica 4, # sky pushing in lb, pushing in replica 5, # sky pulling in lb, pulling in replica, but rate limit + 6, # gke ] -describes = [describes[i] for i in enabled_systems] -presents = [presents[i] for i in enabled_systems] +# Default to just running GKE +enabled_systems = [6] # gke ct = None sn2st = None @@ -53,7 +57,15 @@ def _get_head_ip_for_cluster(cluster: str) -> str: raise ValueError(f'Cluster {cluster} not found') -def _get_endpoint_for_traffic(index: int, sns: List[str]) -> str: +def _get_endpoint_for_traffic(index: int, + sns: List[str], + gke_endpoint: Optional[str] = None) -> str: + if index == 6: # GKE Gateway + if gke_endpoint: + if not gke_endpoint.startswith(('http://', 'https://')): + return f'http://{gke_endpoint}' + return gke_endpoint + return 'http://34.117.239.237:80' # Default GKE endpoint if index == 0: sgl_ip = 
_get_head_ip_for_cluster(utils.sgl_cluster) return f'{sgl_ip}:9001' @@ -76,19 +88,55 @@ def _region_cluster_name(r: str) -> str: def main(): parser = argparse.ArgumentParser() - parser.add_argument('--service-names', type=str, nargs='+', required=True) + parser.add_argument( + '--service-names', + type=str, + nargs='*', + default=[], + help='Service names for SkyPilot services (indices 2-5)') parser.add_argument('--exp-name', type=str, required=True) parser.add_argument('--extra-args', type=str, default='') parser.add_argument('--output-dir', type=str, default='@temp') parser.add_argument('--regions', type=str, default=None, nargs='+') parser.add_argument('--region-to-args', type=str, default=None) + parser.add_argument('--gke-endpoint', + type=str, + default='34.117.239.237:80', + help='GKE Gateway endpoint (IP:port)') + parser.add_argument( + '--run-systems', + type=int, + nargs='+', + default=[6], + help='Indices of systems to run (default: [6] for GKE only)') args = parser.parse_args() - sns = args.service_names - if len(sns) != len(describes): - raise ValueError(f'Expected {len(describes)} service names for ' - f'{", ".join(describes)}') - endpoints = [_get_endpoint_for_traffic(i, sns) for i in enabled_systems] + # Update enabled_systems based on --run-systems + global enabled_systems, describes, presents + enabled_systems = args.run_systems if args.run_systems else [ + 6 + ] # Default to GKE if not specified + + # Filter describes and presents based on enabled_systems + describes = [describes[i] for i in enabled_systems] + presents = [presents[i] for i in enabled_systems] + + # Only require service-names for SkyPilot systems (2-5) + sky_systems_count = sum(1 for s in enabled_systems if 2 <= s <= 5) + sns = args.service_names + if len(sns) != sky_systems_count: + if sky_systems_count > 0: + raise ValueError( + f'Expected {sky_systems_count} service names for SkyPilot') + + # If no SkyPilot services needed, use empty list for non-sky systems + if 
sky_systems_count == 0: + sns = [''] * len(enabled_systems) + + endpoints = [ + _get_endpoint_for_traffic(i, sns, args.gke_endpoint) + for i in enabled_systems + ] print(endpoints) if any('None' in e for e in endpoints): raise ValueError('Some endpoints are not found') @@ -212,8 +260,9 @@ def main(): f.write('# Wait for queue status puller to initialize\n') f.write('echo "Waiting for queue status puller to initialize..."\n') f.write(f'echo "Check log file: tail -f {queue_status_file}"\n') - f.write('while ! grep -q "Pulling queue status" ' - f'{queue_status_file}; do\n') + f.write( + 'while ! grep -q "Pulling queue status\\|Skipping queue polling" ' + f'{queue_status_file}; do\n') f.write(' sleep 1\n') f.write(' echo -n "."\n') f.write('done\n') diff --git a/sky/lbbench/queue_fetcher.py b/sky/lbbench/queue_fetcher.py index 32d3f43e4e2..7d95b30e4ff 100644 --- a/sky/lbbench/queue_fetcher.py +++ b/sky/lbbench/queue_fetcher.py @@ -5,6 +5,7 @@ import json import os from pathlib import Path +import shutil import tempfile import time from typing import Dict, List @@ -18,12 +19,30 @@ def prepare_lb_endpoints_and_confirm(exp2backend: Dict[str, str], yes: bool) -> Dict[str, List[str]]: - req = sky.serve.status(None) - st = sky.client.sdk.get(req) - print(sky.serve.format_service_table(st, show_all=False)) + # Only fetch Serve status if there are SkyPilot Serve endpoints + st = [] + needs_serve_status = any( + 'aws.cblmemo.net' in url for url in exp2backend.values()) + + if needs_serve_status: + try: + req = sky.serve.status(None) + st = sky.client.sdk.get(req) + print(sky.serve.format_service_table(st, show_all=False)) + except Exception as e: # pylint: disable=broad-except + print(f'Warning: Could not fetch SkyPilot Serve status: {e}') + st = [] + else: + print('No SkyPilot Serve endpoints found; skipping `sky serve status` ' + 'call.') def _get_one_endpoints(backend_url: str) -> List[str]: if 'aws.cblmemo.net' in backend_url: + if not st: + print(f'Warning: No SkyPilot 
Serve status available for ' + f'{backend_url}') + return [] + service_name = backend_url.split('.')[0] st4svc = None for svc in st: @@ -40,6 +59,11 @@ def _get_one_endpoints(backend_url: str) -> List[str]: endpoints = [url] elif '9001' in backend_url: # SGLang Router endpoints = [utils.sgl_cluster] + elif '34.117.239.237' in backend_url: # GKE endpoint + url = backend_url + if not url.startswith('http://'): + url = 'http://' + url + endpoints = [url] else: return [] # raise ValueError(f'Unknown backend URL: {backend_url}') @@ -58,11 +82,32 @@ def _get_one_endpoints(backend_url: str) -> List[str]: async def pull_queue_status(exp_name: str, endpoints: List[str], event: asyncio.Event, output_dir: str) -> None: + # Check if this is a GKE endpoint (34.117.239.237) + is_gke_baseline = False + if endpoints and any( + '34.117.239.237' in endpoint for endpoint in endpoints): + is_gke_baseline = True + tmp_name = os.path.join(tempfile.gettempdir(), f'result_queue_size_{exp_name}.txt') dest_name = (Path(output_dir).expanduser() / 'queue_size' / f'{exp_name}.txt') dest_name.parent.mkdir(parents=True, exist_ok=True) + + # If this is GKE, skip queue polling + if is_gke_baseline: + print(f'Skipping queue polling for GKE baseline: {exp_name}', + flush=True) + with open(dest_name, 'w', encoding='utf-8') as f: + f.write( + json.dumps({ + 'time': time.time(), + 'status': 'skipped' + }) + '\n') + await event.wait() # Wait for stop signal + print(f'Queue fetcher finished (skipped) for GKE: {exp_name}') + return + # Force flush it to make the tee works print(f'Pulling queue status: tail -f {tmp_name} | jq', flush=True) if utils.sgl_cluster in endpoints: @@ -86,7 +131,7 @@ async def pull_queue_status(exp_name: str, endpoints: List[str], lb2confs[endpoint] = conf print(json.dumps(lb2confs), file=f, flush=True) await asyncio.sleep(1) - os.rename(tmp_name, dest_name) + shutil.move(tmp_name, dest_name) def main(): From 046758e585f737432a64981d1a6c7c4cf2ad3a03 Mon Sep 17 00:00:00 2001 From: 
Andy Lee Date: Mon, 28 Apr 2025 16:03:11 +0000 Subject: [PATCH 2/3] docs: readme --- sky/lbbench/README.md | 238 +++++++++++++++++++++++++++++++++-------- sky/lbbench/gen_cmd.py | 10 +- 2 files changed, 198 insertions(+), 50 deletions(-) diff --git a/sky/lbbench/README.md b/sky/lbbench/README.md index 5f6bab226ac..8927135ee0d 100644 --- a/sky/lbbench/README.md +++ b/sky/lbbench/README.md @@ -21,6 +21,8 @@ rm -rf ~/.sky/.wheels_lock ~/.sky/wheels sky down sky-serve-controller- ``` +### A. Preparation for SkyPilot Services + Cloning the plot script to the correct path: ```bash @@ -40,13 +42,57 @@ export HF_TOKEN= This token should have access to `meta-llama/Llama-3.1-8B-Instruct` and `lmsys/chatbot_arena_conversations`. +### B. GKE Environment Setup + +To use the GKE Multi-Cluster Gateway baseline, you need: + +1. `gcloud` and `kubectl` CLI tools with proper GCP project access +2. Two GKE clusters in different regions, VPC-native, part of a Fleet, with Workload Identity enabled + +**Note:** The following steps assume GKE clusters named `sglang-us` and `sglang-asia` already exist and meet the requirements. If you need to create new clusters, use `gcloud container clusters create` with appropriate parameters including Fleet registration, Workload Identity enablement, VPC-native networking, and Gateway API enablement. 
+ +Example clusters setup: +- `sglang-us` in `us-central1` +- `sglang-asia` in `asia-northeast1` + +Set kubectl context environment variables (these will only be valid for your current shell session): +```bash +# Find your project ID if you don't know it +gcloud config get-value project +# List available contexts +kubectl config get-contexts + +# Set context variables with your project ID +export US_CONTEXT=gke_<PROJECT_ID>_us-central1_sglang-us +export ASIA_CONTEXT=gke_<PROJECT_ID>_asia-northeast1_sglang-asia +``` + +Enable required GCP APIs: +```bash +# Core APIs (may already be enabled) +gcloud services enable container.googleapis.com +gcloud services enable compute.googleapis.com +gcloud services enable gkehub.googleapis.com + +# Multi-cluster gateway APIs +gcloud services enable multiclusterservicediscovery.googleapis.com +gcloud services enable multiclusteringress.googleapis.com +gcloud services enable trafficdirector.googleapis.com +``` + +Enable Gateway API on both clusters: +```bash +gcloud container clusters update sglang-us --location=us-central1 --gateway-api=standard +gcloud container clusters update sglang-asia --location=asia-northeast1 --gateway-api=standard +``` + ## Step 1: Launch Services Adjusting the service YAML (`examples/serve/external-lb/llm.yaml`) based on desired replica configuration. The default is 2 replicas in `us-east-2` and 2 replicas in `ap-northeast-1`. **All replicas will be launched in a round-robin fashion in the `ordered` region list**. e.g. if there is 3 regions and 4 replicas, the first region in the list will have 2 replicas and the other two regions will have 1 replica each. **All replicas should use AWS cloud for now**. When adding replicas to other regions, make sure to update the `external_load_balancers` section to add one load balancer for the new region. **All load balancers should use AWS cloud**.
The `route53_hosted_zone` should be configured in the given credentials and no changes is needed - if you need to add a new one, please contact the author. -Running the following command for 4 times. +Running the following command for 6 times. - `svc1`: SGLang Router - `svc2`: SGLang Router [Pull] @@ -70,7 +116,7 @@ sky serve up examples/serve/external-lb/llm.yaml -y -n svc6 --env HF_TOKEN --env Here is a easy-to-use script: ```bash -PREFIX="svc" sky/lbbench/launch_systems.sh +PREFIX="svc" bash sky/lbbench/launch_systems.sh ``` Keep running `sky serve status -v` until all of them are ready (all replicas are ready): @@ -78,52 +124,69 @@ Keep running `sky serve status -v` until all of them are ready (all replicas are ```bash $ sky serve status -v Services -NAME VERSION UPTIME STATUS REPLICAS EXTERNAL_LBS ENDPOINT AUTOSCALING_POLICY LOAD_BALANCING_POLICY REQUESTED_RESOURCES -svc5 1 20m 12s READY 4/4 2/2 svc5.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] -svc1 1 20m 40s READY 4/4 2/2 svc1.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] -svc3 1 20m 20s READY 4/4 2/2 svc3.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] -svc4 1 20m 36s READY 4/4 2/2 svc4.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] -svc2 1 20m 45s READY 4/4 2/2 svc2.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] +NAME VERSION UPTIME STATUS REPLICAS EXTERNAL_LBS ENDPOINT AUTOSCALING_POLICY LOAD_BALANCING_POLICY REQUESTED_RESOURCES +svc3 1 1h 40m 41s READY 4/4 3/3 svc3.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] +svc6 1 1h 39m 1s READY 4/4 3/3 svc6.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] +svc1 1 1h 41m 27s READY 4/4 3/3 svc1.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] +svc5 1 1h 39m 26s READY 4/4 3/3 svc5.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] +svc2 1 1h 41m 9s READY 4/4 3/3 svc2.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] +svc4 1 1h 40m 8s READY 4/4 3/3 
svc4.aws.cblmemo.net:8000 Fixed 4 replicas prefix_tree 1x[L4:1] Service Replicas -SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION ZONE -svc5 1 1 http://43.207.115.174:8081 1 min ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a -svc5 2 1 http://18.119.111.119:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc5 3 1 http://18.183.57.17:8081 1 min ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1c -svc5 4 1 http://3.15.150.242:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc1 1 1 http://54.178.80.208:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a -svc1 2 1 http://3.137.168.179:8081 22 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc1 3 1 http://43.207.108.244:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a -svc1 4 1 http://3.144.37.203:8081 22 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc3 1 1 http://18.183.179.168:8081 2 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a -svc3 2 1 http://18.116.42.193:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc3 3 1 http://43.206.151.166:8081 3 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1c -svc3 4 1 http://18.218.48.79:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc4 1 1 http://18.181.198.138:8081 3 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a 
-svc4 2 1 http://18.118.37.42:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc4 3 1 http://43.207.182.196:8081 3 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1c -svc4 4 1 http://3.137.166.123:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc2 1 1 http://18.179.9.124:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a -svc2 2 1 http://3.128.31.100:8081 22 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a -svc2 3 1 http://52.193.151.185:8081 21 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a -svc2 4 1 http://3.15.229.172:8081 22 mins ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION ZONE +svc3 1 1 http://3.135.193.32:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc3 2 1 http://3.113.16.187:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a +svc3 3 1 http://63.177.255.149:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY eu-central-1 eu-central-1a +svc3 4 1 http://3.128.205.218:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc6 1 1 http://3.147.28.35:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc6 2 1 http://13.230.201.235:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a +svc6 3 1 http://63.178.20.3:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY 
eu-central-1 eu-central-1a +svc6 4 1 http://3.131.38.8:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc1 1 1 http://18.117.229.250:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc1 2 1 http://52.194.245.225:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a +svc1 3 1 http://35.159.32.20:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY eu-central-1 eu-central-1a +svc1 4 1 http://18.227.13.76:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc5 1 1 http://3.145.155.227:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc5 2 1 http://18.183.186.141:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a +svc5 3 1 http://3.70.232.144:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY eu-central-1 eu-central-1a +svc5 4 1 http://3.21.12.15:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc2 1 1 http://18.191.236.14:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc2 2 1 http://13.231.55.112:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a +svc2 3 1 http://52.28.146.141:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY eu-central-1 eu-central-1a +svc2 4 1 http://3.144.107.228:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc4 1 1 http://3.148.178.18:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a +svc4 2 1 http://3.112.194.194:8081 1 hr ago 1x 
AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY ap-northeast-1 ap-northeast-1a +svc4 3 1 http://18.185.63.243:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY eu-central-1 eu-central-1a +svc4 4 1 http://3.16.82.200:8081 1 hr ago 1x AWS(g6.4xlarge, {'L4': 1}, disk_tier=high, ports=['8081']) READY us-east-2 us-east-2a External Load Balancers -SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION ZONE -svc5 1 1 http://18.191.178.226:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a -svc5 2 1 http://52.195.182.20:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a -svc1 1 1 http://18.216.134.208:8000 22 mins ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a -svc1 2 1 http://13.231.55.224:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a -svc3 1 1 http://18.217.140.173:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a -svc3 2 1 http://13.231.3.176:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a -svc4 1 1 http://18.118.207.237:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a -svc4 2 1 http://52.194.190.7:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a -svc2 1 1 http://18.119.116.201:8000 22 mins ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a -svc2 2 1 http://3.112.123.186:8000 21 mins ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +SERVICE_NAME ID VERSION ENDPOINT LAUNCHED RESOURCES STATUS REGION ZONE +svc3 1 1 http://13.58.2.218:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a +svc3 2 1 http://57.180.243.88:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +svc3 3 1 http://3.75.194.104:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY eu-central-1 
eu-central-1a +svc6 1 1 http://3.138.114.90:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a +svc6 2 1 http://54.65.4.162:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +svc6 3 1 http://35.158.96.98:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY eu-central-1 eu-central-1a +svc1 1 1 http://13.58.116.247:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a +svc1 2 1 http://57.180.246.222:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +svc1 3 1 http://3.71.203.38:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY eu-central-1 eu-central-1a +svc5 1 1 http://18.216.140.1:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a +svc5 2 1 http://3.113.11.160:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +svc5 3 1 http://18.156.69.69:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY eu-central-1 eu-central-1a +svc2 1 1 http://18.224.73.161:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a +svc2 2 1 http://54.178.200.168:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +svc2 3 1 http://18.185.177.96:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY eu-central-1 eu-central-1a +svc4 1 1 http://3.149.0.145:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY us-east-2 us-east-2a +svc4 2 1 http://3.112.47.105:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY ap-northeast-1 ap-northeast-1a +svc4 3 1 http://18.199.100.176:8000 1 hr ago 1x AWS(m6i.large, ports=['8000']) READY eu-central-1 eu-central-1a ``` +Note that the GKE baseline system doesn't need a SkyPilot service for testing. + ## Step 2: Launch baseline load balancers +### A.
Launch SGLang Router Baselines (Indices 0, 1) + We compare the performance of our load balancer with the following baselines: - SGLang Router @@ -153,25 +216,111 @@ $ sky logs sgl-router-pull Make sure each load balancer has the desired number of replicas. +### B. Setup/Verify GKE Gateway Baseline (Index 6) + +**Note:** If you have already completed the initial GKE Gateway setup and K8s resources haven't changed, you typically only need to execute **Step 6 (Ensure Deployment is running)** and **Step 5 (Verify Gateway status and IP)** at the end of this section. No need to reapply YAML files. + +#### Step 1: Enable Fleet features + +```bash +# Enable Multi-cluster Services +gcloud container fleet multi-cluster-services enable --project=<PROJECT_ID> + +# Enable Multi-cluster Ingress with US as config cluster +gcloud container fleet ingress enable --config-membership=projects/<PROJECT_ID>/locations/global/memberships/sglang-us --project=<PROJECT_ID> + +# Add IAM policy binding +gcloud projects add-iam-policy-binding <PROJECT_ID> \ + --member "serviceAccount:service-<PROJECT_NUMBER>@gcp-sa-multiclusteringress.iam.gserviceaccount.com" \ + --role "roles/container.admin" +``` + +**Note:** If you encounter issues with the `-mc` GatewayClass not being created, you may need to disable and re-enable the ingress feature.
+ +#### Step 2: Deploy application to GKE + +Apply the deployment YAML to both clusters: + +```bash +kubectl apply -f examples/serve/external-lb/k8s/all.yaml --context=$US_CONTEXT +kubectl apply -f examples/serve/external-lb/k8s/all.yaml --context=$ASIA_CONTEXT +``` + +#### Step 3: Export Services + +Export services to make them available across clusters: + +```bash +kubectl apply -f examples/serve/external-lb/k8s/svc-export.yaml --context=$US_CONTEXT +kubectl apply -f examples/serve/external-lb/k8s/svc-export.yaml --context=$ASIA_CONTEXT +``` + +#### Step 4: Deploy GKE Gateway resources + +Deploy Gateway resources in the config cluster: + +```bash +kubectl apply -f examples/serve/external-lb/k8s/gateway.yaml --context=$US_CONTEXT +kubectl apply -f examples/serve/external-lb/k8s/httproute.yaml --context=$US_CONTEXT +kubectl apply -f examples/serve/external-lb/k8s/healthcheckpolicy.yaml --context=$US_CONTEXT +``` + +#### Step 5: Verify Gateway status and get IP + +Check gateway status and wait for an IP address: + +```bash +kubectl get gateway sglang-external-gateway -n default --context=$US_CONTEXT --watch +``` + +When the gateway shows READY=True and has an IP address, export it: + +```bash +export GKE_GATEWAY_IP=$(kubectl get gateway sglang-external-gateway -n default --context=$US_CONTEXT -o jsonpath='{.status.addresses[0].value}') +echo "Gateway IP: $GKE_GATEWAY_IP" +``` + +#### Step 6: Ensure GKE Deployment is running + +Scale the deployment to the desired number of replicas in both clusters: + +```bash +kubectl scale deployment sglang-deployment --replicas=2 -n default --context=$US_CONTEXT +kubectl scale deployment sglang-deployment --replicas=2 -n default --context=$ASIA_CONTEXT +``` + +Verify the pods are running: + +```bash +kubectl get pods -n default --context=$US_CONTEXT +kubectl get pods -n default --context=$ASIA_CONTEXT +``` + ## Step 3: Generate Bash Scripts to Use We have a util script to generate the benchmark commands. 
This doc will only cover the usage of multi-region clients, which means the requests will be simultaneously sent from multiple regions. -**Notice that the service names should be the same order as the ones used in Step 2**. +**Important:** The `--service-names` parameter must provide service names corresponding to the **SkyPilot services** selected in `--run-systems`. The number of names must match exactly, and they must be in the correct order. For example, if `--run-systems` includes 2 and 4, you need to provide two service names in `--service-names`, the first for system 2 and the second for system 4. If `--run-systems` doesn't include any indices from 0-5, then no service names are needed. Explanation of the arguments: - `--exp-name`: Identifier for the experiment. Please describe the experiment config in the name. - `--extra-args`: Workload specific arguments. - `--regions`: Client regions. This should be a list. +- `--run-systems`: Indices of systems to run (0-6). Default is all systems. +- `--gke-endpoint`: Endpoint of GKE Gateway as `IP:port`, used when running system index 6 (defaults to `34.117.239.237:80`). +- `--service-names`: Names of SkyPilot services corresponding to indices 0-5 in `--run-systems`. **Notice that the `--extra-args` will be applied to all regions**. If you want a total concurrency of 300, you should set `--num-users (300 / num-regions)` for each region. +### Running All Systems (including GKE) + ```bash -python3 -m sky.lbbench.gen_cmd --service-names svc1 svc2 svc3 svc4 svc5 \ - --exp-name arena_syn_mrc_tail_c2000_u300_d240 \ +python3 -m sky.lbbench.gen_cmd --service-names svc1 svc2 svc3 svc4 svc5 svc6 \ + --exp-name arena_syn_mrc_tail_c2000_u150_d240 \ --extra-args '--workload arena_syn --duration 240 --num-conv 2000 --num-users 150' \ - --regions us-east-2 ap-northeast-1 + --regions us-east-2 ap-northeast-1 \ + --gke-endpoint "$GKE_GATEWAY_IP:80" ``` ### Side Note: Support for different configurations in different regions @@ -216,6 +365,7 @@ Final step is to plot the results.
You should see the following output from the 'arena_syn_mrc_100_50_tail_c2000_u150_d240_sky_pull_pull': 'Ours\n[Pull+Pull]', 'arena_syn_mrc_100_50_tail_c2000_u150_d240_sky_push_pull': 'Ours\n[Push+Pull]', 'arena_syn_mrc_100_50_tail_c2000_u150_d240_sky_push_push': 'Ours\n[Push+Push]', + 'arena_syn_mrc_100_50_tail_c2000_u150_d240_gke': 'GKE\nGateway', ``` Copy-pasting them into the `gn2alias` variable in the `@temp/result/plot.py` script and run it. **Make sure to comment out other parts in the variable**. diff --git a/sky/lbbench/gen_cmd.py b/sky/lbbench/gen_cmd.py index 74bd8a85a1b..333e99a162e 100644 --- a/sky/lbbench/gen_cmd.py +++ b/sky/lbbench/gen_cmd.py @@ -107,22 +107,20 @@ def main(): '--run-systems', type=int, nargs='+', - default=[6], - help='Indices of systems to run (default: [6] for GKE only)') + default=all_systems, + help='Indices of systems to run (default: all)') args = parser.parse_args() # Update enabled_systems based on --run-systems global enabled_systems, describes, presents - enabled_systems = args.run_systems if args.run_systems else [ - 6 - ] # Default to GKE if not specified + enabled_systems = args.run_systems # Filter describes and presents based on enabled_systems describes = [describes[i] for i in enabled_systems] presents = [presents[i] for i in enabled_systems] # Only require service-names for SkyPilot systems (2-5) - sky_systems_count = sum(1 for s in enabled_systems if 2 <= s <= 5) + sky_systems_count = sum(1 for s in enabled_systems if 0 <= s <= 5) sns = args.service_names if len(sns) != sky_systems_count: if sky_systems_count > 0: From 39f70424bb4138299d24f29d5c64506160a6ed27 Mon Sep 17 00:00:00 2001 From: Andy Lee Date: Mon, 28 Apr 2025 23:25:03 +0000 Subject: [PATCH 3/3] docs: scale down --- sky/lbbench/README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sky/lbbench/README.md b/sky/lbbench/README.md index 8927135ee0d..6f24fd77139 100644 --- a/sky/lbbench/README.md +++ b/sky/lbbench/README.md @@ 
-40,6 +40,16 @@ Prepare your Hugging Face Token: export HF_TOKEN= ``` +To ensure the SkyServe controller has sufficient capacity to manage multiple services and replicas (up to 16 concurrent services) and can properly communicate with the load balancer, configure your `~/.sky/config.yaml` as follows: + +```yaml +serve: + controller: + resources: + cloud: aws + cpus: 16+ +``` + This token should have access to `meta-llama/Llama-3.1-8B-Instruct` and `lmsys/chatbot_arena_conversations`. ### B. GKE Environment Setup @@ -282,6 +292,13 @@ echo "Gateway IP: $GKE_GATEWAY_IP" #### Step 6: Ensure GKE Deployment is running +Scale the node pool of the GKE cluster: + +```bash +gcloud container clusters resize sglang-us --node-pool=default-pool --num-nodes=2 --region=us-central1 -q +gcloud container clusters resize sglang-asia --node-pool=default-pool --num-nodes=2 --region=asia-northeast1 -q +``` + Scale the deployment to the desired number of replicas in both clusters: ```bash @@ -388,6 +405,14 @@ sky serve down -ay sky stop -ay # Or only cancel it if you want to keep using them in the next run sky cancel -ay sgl-router && sky cancel -ay sgl-router-pull + +kubectl scale deployment sglang-deployment --replicas=0 -n default --context=$US_CONTEXT +kubectl scale deployment sglang-deployment --replicas=0 -n default --context=$ASIA_CONTEXT +# Scale down GKE node pools completely +# Note: This is critical as autoscaling alone may not fully release resources +# due to system pods that keep nodes alive +gcloud container clusters resize sglang-us --node-pool=default-pool --num-nodes=0 --region=us-central1 -q +gcloud container clusters resize sglang-asia --node-pool=default-pool --num-nodes=0 --region=asia-northeast1 -q ``` > The following content is stale. Don't need to check it.