from typing import Dict, List, Any
from PIL import Image
from io import BytesIO
import base64
import torch
import csv
from transformers import CLIPProcessor, CLIPModel
import os


class EndpointHandler():
    def __init__(self, path=""):
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.model.eval()

        # Load categories from CSV
        self.categories = self.load_categories_from_csv(os.path.join(path, "categories.csv"))

    def load_categories_from_csv(self, filepath: str) -> List[str]:
        categories = []
        with open(filepath, newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            for row in reader:
                if row:
                    categories.append(row[0].strip())
        return categories

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Args:
            data: { "inputs": { "image": base64 string } }
        Returns:
            Top 50 categories with the highest similarity score.
        """
        inputs = data.get("inputs", data)

        # Decode the base64-encoded image and convert to RGB
        image = Image.open(BytesIO(base64.b64decode(inputs["image"]))).convert("RGB")

        # Process image and text through the CLIP processor
        processed = self.processor(text=self.categories, images=image, return_tensors="pt", padding=True)

        with torch.no_grad():
            image_features = self.model.get_image_features(pixel_values=processed["pixel_values"])
            text_features = self.model.get_text_features(
                input_ids=processed["input_ids"],
                attention_mask=processed["attention_mask"],
            )

        # L2-normalize features
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

        # Compute cosine similarity between the image and every category
        similarity = (image_features @ text_features.T).squeeze(0)

        # Prepare result sorted by descending score
        result = [{"label": label, "score": score.item()} for label, score in zip(self.categories, similarity)]
        result = sorted(result, key=lambda x: x["score"], reverse=True)

        return result[:50]  # Return top 50
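

# --- Local smoke test (illustrative sketch only, not part of the Inference Endpoints contract) ---
# This block shows how the handler above can be exercised locally with a base64-encoded image.
# "example.jpg" and the use of "." as the categories.csv location are placeholder assumptions
# for local testing; adjust them to your own files.
if __name__ == "__main__":
    handler = EndpointHandler(path=".")  # expects categories.csv in the current directory

    # Encode a local image the same way a client would before calling the endpoint
    with open("example.jpg", "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")

    predictions = handler({"inputs": {"image": encoded}})
    for p in predictions[:5]:
        print(f"{p['label']}: {p['score']:.4f}")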