Update handler.py
handler.py  +25 -11
@@ -3,47 +3,61 @@ from PIL import Image
 from io import BytesIO
 import base64
 import torch
+import csv
 from transformers import CLIPProcessor, CLIPModel
 
+
 class EndpointHandler():
     def __init__(self, path=""):
         self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
         self.model.eval()
 
+        # Load categories from CSV
+        self.categories = self.load_categories_from_csv("categories.csv")
+
+    def load_categories_from_csv(self, filepath: str) -> List[str]:
+        categories = []
+        with open(filepath, newline='', encoding='utf-8') as csvfile:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                if row:
+                    categories.append(row[0].strip())
+        return categories
+
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
         Args:
             data: {
                 "inputs": {
-                    "image": base64 string
-                    "candiates": list of strings
+                    "image": base64 string
                 }
             }
 
         Returns:
-
+            Top 20 categories with highest similarity score.
         """
         inputs = data.get("inputs", data)
 
         # Decode and process image
         image = Image.open(BytesIO(base64.b64decode(inputs["image"]))).convert("RGB")
-        categories = inputs["candiates"]
 
-        #
-        processed = self.processor(text=categories, images=image, return_tensors="pt", padding=True)
+        # Process image and text
+        processed = self.processor(text=self.categories, images=image, return_tensors="pt", padding=True)
+
         with torch.no_grad():
             image_features = self.model.get_image_features(processed["pixel_values"])
             text_features = self.model.get_text_features(processed["input_ids"], attention_mask=processed["attention_mask"])
 
-        # Normalize
+        # Normalize features
         image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
         text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
 
-
+        # Compute similarity
+        similarity = (image_features @ text_features.T).squeeze(0)
 
-        #
-        result = [{"label": label, "score": score.item()} for label, score in zip(categories, similarity)]
+        # Prepare result
+        result = [{"label": label, "score": score.item()} for label, score in zip(self.categories, similarity)]
         result = sorted(result, key=lambda x: x["score"], reverse=True)
-        return result
 
+        return result[:20]  # Return top 20
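The new load_categories_from_csv helper keeps only the first cell of each non-empty row, so a plain one-label-per-line file is enough. An illustrative categories.csv (the labels below are made up; the file must sit in the process working directory, since the path passed in __init__ is relative):

cat
dog
bicycle
mountain landscape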
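A minimal local smoke test for the updated handler, assuming the file is importable as handler and a test.jpg exists on disk (both names are illustrative). Note that the scores are raw cosine similarities between the normalized image and text embeddings (roughly -1 to 1), not softmax probabilities:

import base64

from handler import EndpointHandler

# Base64-encode a local image, mirroring the endpoint's request payload.
with open("test.jpg", "rb") as f:  # illustrative test image
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

handler = EndpointHandler()
result = handler({"inputs": {"image": image_b64}})

# result holds at most 20 {"label", "score"} dicts, best match first.
for entry in result[:5]:
    print(f"{entry['label']}: {entry['score']:.4f}")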