diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 98fb23e293..584edc437e 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -1380,9 +1380,11 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
         res = Template.data_collator(self, batch, padding_to)
         input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
+        feature_attention_mask = [
+            b['feature_attention_mask'] for b in batch if b.get('feature_attention_mask') is not None
+        ]
         if input_features:
             res['input_features'] = torch.concat(input_features)
-            feature_attention_mask = [b['feature_attention_mask'] for b in batch]
             res['feature_attention_mask'] = torch.concat(feature_attention_mask)
         return res
 
diff --git a/swift/llm/utils/vision_utils.py b/swift/llm/utils/vision_utils.py
index 4b09c63b49..bb8f778782 100644
--- a/swift/llm/utils/vision_utils.py
+++ b/swift/llm/utils/vision_utils.py
@@ -292,7 +292,8 @@ def load_video_qwen2(video_path: str):
     if nframes is not None:
         nframes = round_by_factor(nframes, size_factor)
     else:
-        fps = FPS
+        if fps is None:
+            fps = FPS
         nframes = video.size(0) / info['video_fps'] * fps
         nframes = round_by_factor(nframes, size_factor)
     min_frames = get_env_args('min_frames', int, FPS_MIN_FRAMES)
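
The first hunk fixes the audio collator for mixed batches: the old code gathered `feature_attention_mask` from every element of `batch` unconditionally, so any sample without audio features (e.g. a text-only sample) raised a `KeyError` even though `input_features` itself was already filtered. The patch filters both keys the same way. A minimal sketch of the failure mode and the fix, using a standalone `collate_audio` helper and dummy tensor shapes that are illustrative only, not the swift API:

```python
import torch
from typing import Any, Dict, List


def collate_audio(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Filter both keys identically: text-only samples in a mixed batch
    # carry neither 'input_features' nor 'feature_attention_mask'.
    input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
    feature_attention_mask = [
        b['feature_attention_mask'] for b in batch if b.get('feature_attention_mask') is not None
    ]
    res: Dict[str, Any] = {}
    if input_features:
        res['input_features'] = torch.concat(input_features)
        res['feature_attention_mask'] = torch.concat(feature_attention_mask)
    return res


audio_sample = {
    'input_features': torch.zeros(1, 128, 3000),  # shapes are made up for the sketch
    'feature_attention_mask': torch.ones(1, 3000, dtype=torch.long),
}
text_sample: Dict[str, Any] = {}  # no audio keys; the pre-patch list comprehension raised KeyError here
out = collate_audio([audio_sample, text_sample])
assert out['input_features'].shape == (1, 128, 3000)
```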
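The second hunk stops `load_video_qwen2` from clobbering an fps value resolved earlier in the function: the old `else` branch assigned `fps = FPS` unconditionally, so the module default always won before `nframes` was computed from the video duration. The patch falls back to `FPS` only when `fps` is `None`. A minimal sketch of the corrected fallback, with an assumed default value standing in for the real `FPS` constant in `vision_utils.py` and a hypothetical `resolve_fps` helper:

```python
FPS = 2.0  # assumed stand-in for the module-level default


def resolve_fps(fps=None):
    # Fall back to the default only when no fps was supplied; the
    # pre-patch code ran `fps = FPS` unconditionally and discarded
    # any sampling rate resolved earlier.
    if fps is None:
        fps = FPS
    return fps


assert resolve_fps() == 2.0     # default applied
assert resolve_fps(4.0) == 4.0  # supplied value now respected
```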