
Commit 6a44e5b (parent: 200db3b)

[https://nvbugs/5440241][fix] Fix 70B GSM8K Accuracy drop (#6967)
Signed-off-by: Chenfei Zhang <[email protected]>

File tree: 3 files changed, +15 −11 lines

tests/integration/defs/accuracy/references/gsm8k.yaml
Lines changed: 3 additions & 3 deletions

@@ -13,12 +13,12 @@ meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 83.78
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 88.70
+    accuracy: 87.33
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 84.08
+    accuracy: 90.30
   - quant_algo: FP8
-    accuracy: 84.08
+    accuracy: 90.30
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
   - quant_algo: FP8
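
For context, each list entry in these reference files pairs a quantization configuration with its expected score: quant_algo and (optionally) kv_cache_quant_algo identify the combination, and an entry with neither key is the unquantized baseline. Below is a minimal sketch of how such a file can be matched against a model's quantization config; lookup_reference is a hypothetical helper written for illustration, not the repo's harness API.

# Hypothetical lookup over a reference file such as gsm8k.yaml.
import yaml

def lookup_reference(references, model_name, quant_algo=None,
                     kv_cache_quant_algo=None):
    """Return the expected accuracy for a (quant_algo, kv_cache) combination."""
    for entry in references[model_name]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model_name} with "
                   f"{quant_algo}/{kv_cache_quant_algo}")

with open("tests/integration/defs/accuracy/references/gsm8k.yaml") as f:
    refs = yaml.safe_load(f)

# With the values from this commit: FP8 weights + FP8 KV cache -> 90.30.
print(lookup_reference(refs, "meta-llama/Llama-3.3-70B-Instruct",
                       quant_algo="FP8", kv_cache_quant_algo="FP8"))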

tests/integration/defs/accuracy/references/mmlu.yaml
Lines changed: 3 additions & 3 deletions

@@ -63,12 +63,12 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 81.31
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 79.31
+    accuracy: 78.78
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 81.02
+    accuracy: 80.40
   - quant_algo: FP8
-    accuracy: 80.34
+    accuracy: 80.40
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
   - quant_algo: FP8
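
The references serve as pass/fail thresholds for the accuracy suite. As a rough illustration only (the actual harness may apply a statistical acceptance criterion rather than a fixed margin), a run passes when the measured score does not fall too far below the YAML value:

# Illustrative threshold check; abs_tolerance is an assumed knob,
# not a parameter of the real harness.
def check_accuracy(measured: float, reference: float,
                   abs_tolerance: float = 1.0) -> None:
    assert measured >= reference - abs_tolerance, (
        f"accuracy {measured:.2f} is more than {abs_tolerance} points "
        f"below the reference {reference:.2f}")

# Example with the updated MMLU reference for FP8 weights + FP8 KV cache:
check_accuracy(measured=80.12, reference=80.40)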

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 9 additions & 5 deletions

@@ -499,7 +499,7 @@ def test_eagle3_tp8(self, eagle3_one_model):
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
     def test_fp8_tp4(self):
-        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -508,6 +508,7 @@ def test_fp8_tp4(self):
                  kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             sampling_params = SamplingParams(
+                max_tokens=256,
                 temperature=0.0,
                 add_special_tokens=False,
             )
@@ -517,16 +518,20 @@ def test_fp8_tp4(self):
             task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
-                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
     def test_nvfp4_tp4(self):
-        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
-        with LLM(model_path, tensor_parallel_size=4) as llm:
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        with LLM(model_path,
+                 tensor_parallel_size=4,
+                 max_batch_size=32,
+                 kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             sampling_params = SamplingParams(
+                max_tokens=256,
                 temperature=0.0,
                 add_special_tokens=False,
             )
@@ -536,7 +541,6 @@ def test_nvfp4_tp4(self):
             task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
-                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
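
Stitched together from the hunks above, test_nvfp4_tp4 after this change reads roughly as follows. This is a reconstruction for readability: the class context, exact indentation, and the unchanged lines between hunks are not visible in the diff.

    @pytest.mark.skip_less_device(4)
    @skip_pre_blackwell
    def test_nvfp4_tp4(self):
        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
        with LLM(model_path,
                 tensor_parallel_size=4,
                 max_batch_size=32,
                 kv_cache_config=kv_cache_config) as llm:
            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
            sampling_params = SamplingParams(
                max_tokens=256,
                temperature=0.0,
                add_special_tokens=False,
            )
            ...  # unchanged evaluation steps elided between hunks
            task.evaluate(llm, sampling_params=sampling_params)
            task = GPQADiamond(self.MODEL_NAME)
            task.evaluate(llm,
                          extra_evaluator_kwargs=dict(apply_chat_template=True))

The same two behavioral changes apply to test_fp8_tp4: generation is capped at max_tokens=256, and the GPQADiamond call no longer passes sampling_params.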