@@ -258,30 +258,6 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
-    def test_nixl_backend(self):
-        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        disaggregated_server_config = {
-            "hostname": "localhost",
-            "port": 8000,
-            "backend": "pytorch",
-            "context_servers": {
-                "num_instances": 1,
-                "urls": ["localhost:8001"]
-            },
-            "generation_servers": {
-                "num_instances": 1,
-                "urls": ["localhost:8002"]
-            }
-        }
-        with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
     def test_auto_dtype(self, disable_overlap_scheduler):
@@ -478,8 +454,18 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
     def test_nixl_backend(self):
-        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
@@ -494,10 +480,8 @@ def test_nixl_backend(self):
             }
         }
         with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config,
-                                      gen_server_config,
-                                      self.MODEL_PATH,
-                                      tensor_parallel_size=4) as llm:
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -600,6 +584,40 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-8B"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
 
+    def test_nixl_backend(self):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {
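Note: the three test_nixl_backend variants touched above share one shape: bring up a one-context/one-generation disaggregated pair that transfers KV cache over the NIXL backend, then run MMLU and GSM8K against it. Below is a minimal sketch of that shared pattern; the helper name run_nixl_disagg_eval is hypothetical, while launch_disaggregated_llm, MMLU, and GSM8K are the harness utilities already used in the diff.

    # Hypothetical helper sketching the pattern repeated in the tests above;
    # not part of the diff itself.
    def run_nixl_disagg_eval(model_name, model_path):
        # Both servers transfer KV cache via the NIXL backend; the overlap
        # scheduler is disabled on each, matching the added tests.
        per_server_config = {
            "disable_overlap_scheduler": True,
            "cache_transceiver_config": {"backend": "nixl"},
        }
        disaggregated_server_config = {
            "hostname": "localhost",
            "port": 8000,
            "backend": "pytorch",
            "context_servers": {"num_instances": 1, "urls": ["localhost:8001"]},
            "generation_servers": {"num_instances": 1, "urls": ["localhost:8002"]},
        }
        # Pass separate dicts so the ctx and gen configs can diverge later.
        with launch_disaggregated_llm(disaggregated_server_config,
                                      dict(per_server_config),
                                      dict(per_server_config),
                                      model_path) as llm:
            for task_cls in (MMLU, GSM8K):
                task_cls(model_name).evaluate(llm)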