diff --git a/examples/models/llava/export_llava.py b/examples/models/llava/export_llava.py
index a5057e5e850..82c7aca09e0 100644
--- a/examples/models/llava/export_llava.py
+++ b/examples/models/llava/export_llava.py
@@ -67,6 +67,7 @@ def export(self) -> "LlavaEdgeManager":
             dynamic_shapes=dynamic_shape,
             strict=False,
         )
+        # pyre-ignore: Incompatible attribute type [8]: Attribute `pre_autograd_graph_module` declared in class `LLMEdgeManager` has type `Optional[GraphModule]` but is used as type `Module`.
         self.pre_autograd_graph_module = self.export_program.module()
         return self
 
diff --git a/examples/models/llava/image_util.py b/examples/models/llava/image_util.py
index bf5e331d61c..3f78f0a6ed6 100644
--- a/examples/models/llava/image_util.py
+++ b/examples/models/llava/image_util.py
@@ -21,6 +21,7 @@
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 
 
+# pyre-ignore: Undefined or invalid type [11]: Annotation `Image` is not defined as a type.
 def prepare_image(image: Image, target_h: int, target_w: int) -> torch.Tensor:
     """Read image into a tensor and resize the image so that it fits in
     a target_h x target_w canvas.
diff --git a/examples/models/llava/model.py b/examples/models/llava/model.py
index c0f8ecfc354..5e215d1c035 100644
--- a/examples/models/llava/model.py
+++ b/examples/models/llava/model.py
@@ -21,6 +21,7 @@
 from executorch.examples.models.llama.source_transformation.sdpa import (
     replace_sdpa_with_custom_op,
 )
+
 from executorch.examples.models.llava.image_util import prepare_image
 from executorch.examples.models.model_base import EagerModelBase
 from PIL import Image
@@ -48,6 +49,7 @@ def __init__(
         self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op
         self.model_ = llava_model
         self.image_processor = image_processor
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `config`.
         self.vision_feature_layer = self.model_.config.vision_feature_layer
         self.vision_feature_select_strategy = (
             self.model_.config.vision_feature_select_strategy
@@ -76,6 +78,7 @@ def __init__(
         )
 
     def _translate_state_dict_for_text_model(self) -> Dict[str, Any]:
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `language_model`.
         state_dict = self.model_.language_model.state_dict()
         key_map = {
             # fmt: off
@@ -128,9 +131,11 @@ def get_model(self):
         return self.model_.get_model()
 
     def embed_tokens(self, tokens: torch.Tensor) -> torch.Tensor:
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `language_model`.
         return self.model_.language_model.model.embed_tokens(tokens)
 
     def encode_images(self, images: torch.Tensor) -> torch.Tensor:
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `dtype`.
         images = images.to(dtype=self.model_.dtype)
         if type(images) is list:
             image_features = []
@@ -144,15 +149,19 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor:
                 image_feature = self._feature_select(image_forward_out).to(image.dtype)
                 image_features.append(image_feature)
         else:
+            # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `vision_tower`.
             image_forward_outs = self.model_.vision_tower(
+                # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `device`.
                 images.to(device=self.model_.device, dtype=self.model_.dtype),
                 output_hidden_states=True,
             )
             image_features = self._feature_select(image_forward_outs).to(images.dtype)
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `multi_modal_projector`.
         image_features = self.model_.multi_modal_projector(image_features)
         return image_features
 
     def image_preprocess(self, img: torch.Tensor) -> torch.Tensor:
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_vision_objects.CLIPImageProcessor` has no attribute `crop_size`.
         target_h = self.image_processor.crop_size["height"]
         target_w = self.image_processor.crop_size["width"]
         # pad the image with median rgb value, to make a square
@@ -195,10 +204,15 @@ def image_preprocess(self, img: torch.Tensor) -> torch.Tensor:
         # print(resized.shape)
         # cropped = F.center_crop(img, output_size=[w, w])
         # print(cropped.shape)
+        # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_vision_objects.CLIPImageProcessor` has no attribute `rescale_factor`.
         scaled = resized * self.image_processor.rescale_factor
         # print(scaled)
         normed = F.normalize(
-            scaled, self.image_processor.image_mean, self.image_processor.image_std
+            scaled,
+            # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_vision_objects.CLIPImageProcessor` has no attribute `image_mean`.
+            self.image_processor.image_mean,
+            # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_vision_objects.CLIPImageProcessor` has no attribute `image_std`.
+            self.image_processor.image_std,
         )
         # print(normed)
         return normed.unsqueeze(0)
@@ -249,7 +263,9 @@ def prefill_ref(
     ) -> torch.Tensor:
         """Avoiding the torch.where() call to find placeholder and insert image embedding. Taking 3 inputs instead."""
         embeds = self.prefill_embedding(prompt_before_image, images, prompt_after_image)
+        # pyre-ignore: Undefined attribute [16]: Module `transformers` has no attribute `LlamaForCausalLM`.
         return LlamaForCausalLM.forward(
+            # pyre-ignore: Undefined attribute [16]: `transformers.utils.dummy_pt_objects.LlavaForConditionalGeneration` has no attribute `language_model`.
             self.model_.language_model,
             inputs_embeds=embeds,
             return_dict=False,
@@ -268,12 +284,16 @@ class LlavaModel(EagerModelBase):
     def __init__(self, use_sdpa_with_kv_cache_op=True, max_seq_len=768):
         self.use_sdpa_with_kv_cache_op = use_sdpa_with_kv_cache_op
         self.max_seq_len = max_seq_len
-        self.processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        self.processor = AutoProcessor.from_pretrained(
+            "llava-hf/llava-1.5-7b-hf",
+            revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb",  # Need this for transformers >= 4.44.2
+        )
         self.tokenizer = self.processor.tokenizer
         self.image_processor = self.processor.image_processor
         self.model = LlavaForConditionalGeneration.from_pretrained(
             "llava-hf/llava-1.5-7b-hf",
             device_map="cpu",
+            revision="a272c74b2481d8aff3aa6fc2c4bf891fe57334fb",  # Need this for transformers >= 4.44.2
         )
         self.image = Image.open(
             requests.get(
diff --git a/examples/models/llava/targets.bzl b/examples/models/llava/targets.bzl
new file mode 100644
index 00000000000..5efb099f06f
--- /dev/null
+++ b/examples/models/llava/targets.bzl
@@ -0,0 +1,24 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "get_oss_build_kwargs", "runtime")
+
+def define_common_targets():
+    runtime.cxx_binary(
+        name = "main",
+        srcs = [
+            "main.cpp",
+        ],
+        compiler_flags = ["-Wno-global-constructors"],
+        preprocessor_flags = [
+            "-DET_USE_THREADPOOL",
+        ],
+        deps = [
+            "//executorch/examples/models/llava/runner:runner",
+            "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/threadpool:cpuinfo_utils",
+            "//executorch/extension/threadpool:threadpool",
+        ],
+        external_deps = [
+            "gflags",
+            "torch-core-cpp",
+        ],
+        **get_oss_build_kwargs()
+    )
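
Note (not part of the patch): the model.py hunk above pins the Hugging Face checkpoint revision, with the in-line comment stating the pin is needed for transformers >= 4.44.2. A minimal standalone sketch of that loading pattern, assuming transformers is installed and the llava-hf/llava-1.5-7b-hf checkpoint is reachable (the MODEL_ID and REVISION names are illustrative, not from the diff):

# Illustrative sketch only: mirrors the pinned-revision loading introduced
# in examples/models/llava/model.py above.
from transformers import AutoProcessor, LlavaForConditionalGeneration

MODEL_ID = "llava-hf/llava-1.5-7b-hf"
REVISION = "a272c74b2481d8aff3aa6fc2c4bf891fe57334fb"  # per the diff: needed for transformers >= 4.44.2

processor = AutoProcessor.from_pretrained(MODEL_ID, revision=REVISION)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="cpu",
    revision=REVISION,
)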