Skip to content

Detector #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
redis_data/appendonlydir/
redis_data/dump.rdb
logs/
logs/
.cache
4 changes: 3 additions & 1 deletion directai_fastapi/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
WORKDIR /directai_fastapi

RUN apt-get update
RUN apt-get install libgl1 libglib2.0-0 libsm6 libxrender1 libxext6 -y

RUN apt-get install git -y

RUN apt-get install cmake build-essential -y
COPY requirements.txt .
RUN pip install -r requirements.txt
Expand Down
58 changes: 47 additions & 11 deletions directai_fastapi/modeling/distributed_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,59 @@
from typing import List
from pydantic_models import ClassifierResponse, SingleDetectionResponse
from modeling.image_classifier import ZeroShotImageClassifierWithFeedback
from modeling.object_detector import ZeroShotObjectDetectorWithFeedback


serve.start(http_options={"port": 8100})


@serve.deployment
class ObjectDetector:
    """Ray Serve deployment that runs zero-shot object detection on a single image.

    Wraps ``ZeroShotObjectDetectorWithFeedback`` and converts its raw per-label
    box tensors into ``SingleDetectionResponse`` objects.

    NOTE(review): the original span contained a leftover pre-diff placeholder
    ``__call__`` (random dog box) ahead of the real implementation; it has been
    removed here so only one ``__call__`` is defined.
    """

    def __init__(self) -> None:
        # Prefer GPU when available; the model is placed on the chosen device.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = ZeroShotObjectDetectorWithFeedback(device=device)

    async def __call__(
        self,
        image: bytes,
        labels: list[str],
        inc_sub_labels_dict: dict[str, list[str]],
        exc_sub_labels_dict: dict[str, list[str]] | None = None,
        label_conf_thres: dict[str, float] | None = None,
        augment_examples: bool = True,
        nms_thre: float = 0.4,
        run_class_agnostic_nms: bool = False,
    ) -> list[SingleDetectionResponse]:
        """Detect objects in ``image`` and return one response per surviving box.

        Args:
            image: Raw encoded image bytes.
            labels: Top-level label names, in the order the model reports boxes.
            inc_sub_labels_dict: Per-label sub-labels that count as positives.
            exc_sub_labels_dict: Per-label sub-labels to exclude, if any.
            label_conf_thres: Optional per-label confidence thresholds.
            augment_examples: Whether to apply prompt augmentation in the model.
            nms_thre: IoU threshold used for non-maximum suppression.
            run_class_agnostic_nms: If True, NMS is applied across all classes.

        Returns:
            A flat list of ``SingleDetectionResponse`` for every detection.
        """
        # inference_mode disables autograd bookkeeping; autocast matches the
        # model's device for mixed-precision execution.
        with torch.inference_mode(), torch.autocast(str(self.model.device)):
            batched_predicted_boxes = self.model(
                image,
                labels=labels,
                inc_sub_labels_dict=inc_sub_labels_dict,
                exc_sub_labels_dict=exc_sub_labels_dict,
                label_conf_thres=label_conf_thres,
                augment_examples=augment_examples,
                nms_thre=nms_thre,
                run_class_agnostic_nms=run_class_agnostic_nms,
            )

        # We passed a single image, so the batch dimension has size 1.
        per_label_boxes = batched_predicted_boxes[0]

        # per_label_boxes is ordered like `labels`; each box is
        # [x1, y1, x2, y2, confidence].
        detection_responses = []
        for label, boxes in zip(labels, per_label_boxes):
            for detection in boxes:
                det_dict = {
                    "tlbr": detection[:4].tolist(),
                    "score": detection[4].item(),
                    "class_": label,
                }
                single_detection_response = SingleDetectionResponse.parse_obj(
                    det_dict
                )
                detection_responses.append(single_detection_response)

        return detection_responses


@serve.deployment
Expand All @@ -41,8 +78,7 @@ async def __call__(
exc_sub_labels_dict: dict[str, list[str]] | None = None,
augment_examples: bool = True,
) -> ClassifierResponse:

with torch.no_grad(), torch.autocast(str(self.model.device)):
with torch.inference_mode(), torch.autocast(str(self.model.device)):
raw_scores = self.model(
image,
labels=labels,
Expand Down
32 changes: 2 additions & 30 deletions directai_fastapi/modeling/image_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from modeling.tensor_utils import (
batch_encode_cache_missed_list_elements,
image_bytes_to_tensor,
squish_labels,
)
from modeling.prompt_templates import noop_hypothesis_formats, many_hypothesis_formats
from lru import LRU
Expand Down Expand Up @@ -134,35 +135,6 @@ def encode_text(self, text: list[str], augment: bool = True) -> torch.Tensor:
self.not_augmented_label_encoding_cache,
)

def squish_labels(
    self,
    labels: list[str],
    inc_sub_labels_dict: dict[str, list[str]],
    exc_sub_labels_dict: dict[str, list[str]],
) -> tuple[list[str], dict[str, int]]:
    """Flatten every include/exclude sub-label into one deduplicated list.

    Walks ``labels`` in order and, for each label, appends first its include
    sub-labels and then its exclude sub-labels, skipping anything already
    collected. Returns the ordered list plus a mapping from each sub-label
    to its index in that list (first-seen order).
    """
    sub_label_to_ind: dict[str, int] = {}
    ordered_sub_labels: list[str] = []

    for label in labels:
        # Includes are gathered before excludes for each label.
        for group in (
            inc_sub_labels_dict.get(label),
            exc_sub_labels_dict.get(label),
        ):
            if group is None:
                continue
            for sub_label in group:
                if sub_label not in sub_label_to_ind:
                    sub_label_to_ind[sub_label] = len(sub_label_to_ind)
                    ordered_sub_labels.append(sub_label)

    return ordered_sub_labels, sub_label_to_ind

def forward(
self,
image: torch.Tensor | bytes,
Expand All @@ -189,7 +161,7 @@ def forward(
label: excs for label, excs in exc_sub_labels_dict.items() if len(excs) > 0
}

all_labels, all_labels_to_inds = self.squish_labels(
all_labels, all_labels_to_inds = squish_labels(
labels, inc_sub_labels_dict, exc_sub_labels_dict
)
text_features = self.encode_text(all_labels, augment=augment_examples)
Expand Down
Loading