diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 98fb23e293..584edc437e 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -1380,9 +1380,11 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
     def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
         res = Template.data_collator(self, batch, padding_to)
         input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
+        feature_attention_mask = [
+            b['feature_attention_mask'] for b in batch if b.get('feature_attention_mask') is not None
+        ]
         if input_features:
             res['input_features'] = torch.concat(input_features)
-            feature_attention_mask = [b['feature_attention_mask'] for b in batch]
             res['feature_attention_mask'] = torch.concat(feature_attention_mask)
         return res
 
diff --git a/swift/llm/utils/vision_utils.py b/swift/llm/utils/vision_utils.py
index 4b09c63b49..bb8f778782 100644
--- a/swift/llm/utils/vision_utils.py
+++ b/swift/llm/utils/vision_utils.py
@@ -292,7 +292,8 @@ def load_video_qwen2(video_path: str):
     if nframes is not None:
         nframes = round_by_factor(nframes, size_factor)
     else:
-        fps = FPS
+        if fps is None:
+            fps = FPS
         nframes = video.size(0) / info['video_fps'] * fps
         nframes = round_by_factor(nframes, size_factor)
     min_frames = get_env_args('min_frames', int, FPS_MIN_FRAMES)
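
The first hunk fixes the audio collator for mixed batches: the old code gathered `feature_attention_mask` from every element of `batch` unconditionally, so any sample without audio features (e.g. a text-only sample) raised a `KeyError` even though `input_features` itself was already filtered. The patch filters both keys the same way. A minimal sketch of the failure mode and the fix, using a standalone `collate_audio` helper and dummy tensor shapes that are illustrative only, not the swift API:

```python
import torch
from typing import Any, Dict, List


def collate_audio(batch: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Filter both keys identically: text-only samples in a mixed batch
    # carry neither 'input_features' nor 'feature_attention_mask'.
    input_features = [b['input_features'] for b in batch if b.get('input_features') is not None]
    feature_attention_mask = [
        b['feature_attention_mask'] for b in batch if b.get('feature_attention_mask') is not None
    ]
    res: Dict[str, Any] = {}
    if input_features:
        res['input_features'] = torch.concat(input_features)
        res['feature_attention_mask'] = torch.concat(feature_attention_mask)
    return res


audio_sample = {
    'input_features': torch.zeros(1, 128, 3000),  # shapes are made up for the sketch
    'feature_attention_mask': torch.ones(1, 3000, dtype=torch.long),
}
text_sample: Dict[str, Any] = {}  # no audio keys; the pre-patch list comprehension raised KeyError here
out = collate_audio([audio_sample, text_sample])
assert out['input_features'].shape == (1, 128, 3000)
```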
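The second hunk stops `load_video_qwen2` from clobbering an fps value resolved earlier in the function: the old `else` branch assigned `fps = FPS` unconditionally, so the module default always won before `nframes` was computed from the video duration. The patch falls back to `FPS` only when `fps` is `None`. A minimal sketch of the corrected fallback, with an assumed default value standing in for the real `FPS` constant in `vision_utils.py` and a hypothetical `resolve_fps` helper:

```python
FPS = 2.0  # assumed stand-in for the module-level default


def resolve_fps(fps=None):
    # Fall back to the default only when no fps was supplied; the
    # pre-patch code ran `fps = FPS` unconditionally and discarded
    # any sampling rate resolved earlier.
    if fps is None:
        fps = FPS
    return fps


assert resolve_fps() == 2.0     # default applied
assert resolve_fps(4.0) == 4.0  # supplied value now respected
```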