Skip to content

Commit 3572617

Browse files
committed
[SW-194177] - Integrate new vllm-PA algo with HQT
Change-Id: I94c9679f0aff7c2f9a86a802da825bfd6d0772ad
1 parent 5e3a679 commit 3572617

File tree

1 file changed

+3
-5
lines changed

1 file changed

+3
-5
lines changed

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -530,12 +530,10 @@ def forward_measure(self, input, cache, block_indices, block_offset):
530 | 530 |          measure_output((output_cache), self._mod_extra_config.outputs)
531 | 531 |          return output_cache
532 | 532 |
533 |     | -    def fetch_from_cache(self, cache, blocks, permutations):
    | 533 | +    def fetch_from_cache(self, cache, blocks):
534 | 534 |          quant_cache = self.quant_input(cache)
535 |     | -        output_cache = self.orig_fetch_from_cache(quant_cache, blocks, permutations)
536 |     | -        for i in range(len(output_cache)):
537 |     | -            output_cache[i]=self.quant_output(output_cache[i])
538 |     | -        return output_cache
    | 535 | +        output_cache = self.orig_fetch_from_cache(quant_cache, blocks)
    | 536 | +        return self.quant_output(output_cache)
539 | 537 |
540 | 538 |
541 | 539 |  class PatchedConv2d(nn.Conv2d):

0 commit comments

Comments
 (0)