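[SW-194177] - Integrate new vllm-PA algo with HQT

Drop the `permutations` argument from the patched `fetch_from_cache` in helper_modules.py and pass the fetched result to `quant_output` once, instead of applying it to each element of the result in a loop.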

nirda7 · nirda7 · commit 357261786548 · 2024-07-25T10:32:50.000+03:00
Change-Id: I94c9679f0aff7c2f9a86a802da825bfd6d0772ad
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py
@@ -530,12 +530,10 @@ def forward_measure(self, input, cache, block_indices, block_offset):
         measure_output((output_cache), self._mod_extra_config.outputs)
         return output_cache
 
-    def fetch_from_cache(self, cache, blocks, permutations):
+    def fetch_from_cache(self, cache, blocks):
         quant_cache = self.quant_input(cache)
-        output_cache = self.orig_fetch_from_cache(quant_cache, blocks, permutations)
-        for i in range(len(output_cache)):
-            output_cache[i]=self.quant_output(output_cache[i])
-        return output_cache
+        output_cache = self.orig_fetch_from_cache(quant_cache, blocks)
+        return self.quant_output(output_cache)
 
 
 class PatchedConv2d(nn.Conv2d):