@@ -499,7 +499,7 @@ def test_eagle3_tp8(self, eagle3_one_model):
499
499
@pytest .mark .skip_less_device (4 )
500
500
@skip_pre_hopper
501
501
def test_fp8_tp4 (self ):
502
- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub /Llama-3.3-70B-Instruct-fp8 "
502
+ model_path = f"{ llm_models_root ()} /llama-3.3-models /Llama-3.3-70B-Instruct-FP8 "
503
503
kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
504
504
with LLM (model_path ,
505
505
tensor_parallel_size = 4 ,
@@ -508,6 +508,7 @@ def test_fp8_tp4(self):
508
508
kv_cache_config = kv_cache_config ) as llm :
509
509
assert llm .args .quant_config .quant_algo == QuantAlgo .FP8
510
510
sampling_params = SamplingParams (
511
+ max_tokens = 256 ,
511
512
temperature = 0.0 ,
512
513
add_special_tokens = False ,
513
514
)
@@ -517,16 +518,20 @@ def test_fp8_tp4(self):
517
518
task .evaluate (llm , sampling_params = sampling_params )
518
519
task = GPQADiamond (self .MODEL_NAME )
519
520
task .evaluate (llm ,
520
- sampling_params = sampling_params ,
521
521
extra_evaluator_kwargs = dict (apply_chat_template = True ))
522
522
523
523
@pytest .mark .skip_less_device (4 )
524
524
@skip_pre_blackwell
525
525
def test_nvfp4_tp4 (self ):
526
- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
527
- with LLM (model_path , tensor_parallel_size = 4 ) as llm :
526
+ model_path = f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
527
+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
528
+ with LLM (model_path ,
529
+ tensor_parallel_size = 4 ,
530
+ max_batch_size = 32 ,
531
+ kv_cache_config = kv_cache_config ) as llm :
528
532
assert llm .args .quant_config .quant_algo == QuantAlgo .NVFP4
529
533
sampling_params = SamplingParams (
534
+ max_tokens = 256 ,
530
535
temperature = 0.0 ,
531
536
add_special_tokens = False ,
532
537
)
@@ -536,7 +541,6 @@ def test_nvfp4_tp4(self):
536
541
task .evaluate (llm , sampling_params = sampling_params )
537
542
task = GPQADiamond (self .MODEL_NAME )
538
543
task .evaluate (llm ,
539
- sampling_params = sampling_params ,
540
544
extra_evaluator_kwargs = dict (apply_chat_template = True ))
541
545
542
546
0 commit comments