""" 🐦 BirdSense Pro - AI Bird Identification - Local: Ollama LLaVA (vision) + Llama3.2 (text/audio) - Cloud: HuggingFace BLIP-2 + Text models NO HARDCODED BIRDS - Pure AI identification """ import gradio as gr import numpy as np import scipy.signal as signal from typing import Tuple, List, Dict, Optional import json import requests import re import urllib.parse import os import traceback from PIL import Image import io import base64 # ================== CONFIG ================== OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") HF_TOKEN = os.environ.get("HF_TOKEN", "") DEBUG = True def log(msg): if DEBUG: print(f"[BirdSense] {msg}") # ================== CSS ================== CSS = """ .gradio-container { background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important; font-family: 'Inter', sans-serif !important; } .header { background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%); color: white; padding: 35px 20px; border-radius: 16px; text-align: center; margin-bottom: 16px; box-shadow: 0 10px 30px rgba(26, 54, 93, 0.25); } .header h1 { font-size: 2.2rem; font-weight: 800; margin: 0 0 8px 0; } .header .subtitle { font-size: 1rem; opacity: 0.9; margin-bottom: 10px; } .header .status { display: inline-flex; align-items: center; gap: 6px; background: rgba(255,255,255,0.15); padding: 6px 16px; border-radius: 50px; font-weight: 600; font-size: 0.85rem; } .status-dot { width: 8px; height: 8px; border-radius: 50%; } .status-green { background: #48bb78; } .status-yellow { background: #ecc94b; } .status-red { background: #fc8181; } .info-box { background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%); border: 1px solid #90cdf4; border-radius: 10px; padding: 14px; margin-bottom: 14px; } .info-box h3 { color: #2b6cb0; margin: 0 0 4px 0; font-size: 0.95rem; } .info-box p { color: #4299e1; margin: 0; font-size: 0.85rem; } .bird-card { background: white; border: 1px solid #e2e8f0; border-radius: 14px; padding: 16px; margin: 10px 
# ================== OLLAMA FUNCTIONS ==================

def check_ollama_models() -> Dict:
    """Probe the Ollama server and pick one vision and one text model.

    Returns:
        A dict with three keys:
        ``available``    - True when ``/api/tags`` answered with HTTP 200.
        ``vision_model`` - first model whose name contains "llava", else None.
        ``text_model``   - first non-llava model whose name contains "llama",
                           "qwen", "mistral" or "phi", else None.

    Any failure (connection error, bad JSON, unexpected payload) is logged
    and reported through the returned dict; nothing is raised.
    """
    result = {"available": False, "vision_model": None, "text_model": None}
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
        if response.status_code != 200:
            log(f"Ollama check returned HTTP {response.status_code}")
            return result

        models = [m["name"] for m in response.json().get("models") or []]
        log(f"Ollama models: {models}")
        result["available"] = True

        # "bakllava" contains "llava", so a single substring test covers both.
        result["vision_model"] = next(
            (m for m in models if "llava" in m.lower()), None
        )

        # Vision models are excluded so the text slot never gets a llava build.
        text_families = ("llama", "qwen", "mistral", "phi")
        result["text_model"] = next(
            (
                m for m in models
                if "llava" not in m.lower()
                and any(family in m.lower() for family in text_families)
            ),
            None,
        )
    except Exception as e:
        log(f"Ollama check failed: {e}")
    return result


def _llava_image_to_b64(image, max_size: int = 768, quality: int = 85) -> str:
    """Downscale *image* to fit *max_size* and return it as base64 JPEG text."""
    longest = max(image.size)
    if longest > max_size:
        ratio = max_size / longest
        # Clamp to 1px: very elongated images would otherwise round to a
        # zero-sized side, which resize() rejects.
        new_size = (
            max(1, int(image.size[0] * ratio)),
            max(1, int(image.size[1] * ratio)),
        )
        image = image.resize(new_size, Image.Resampling.LANCZOS)

    # JPEG cannot store alpha or palette data; saving an RGBA/P/LA image
    # raises OSError, so normalise the mode first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffer = io.BytesIO()
    image.save(buffer, format="JPEG", quality=quality)
    return base64.b64encode(buffer.getvalue()).decode()


def call_llava(image: Image.Image, prompt: str, model: str) -> str:
    """Send *image* and *prompt* to an Ollama vision model.

    Args:
        image: Picture to analyse; it is downscaled to at most 768px on its
            longest side and re-encoded as JPEG before upload.
        prompt: Instruction text for the model.
        model: Ollama model name to call.

    Returns:
        The model's ``response`` text, or "" on any error (errors are logged).
    """
    try:
        img_b64 = _llava_image_to_b64(image)

        log(f"Calling LLaVA ({model}) with {len(img_b64)} bytes image...")

        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "images": [img_b64],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 1200}
            },
            timeout=120
        )

        if response.status_code == 200:
            result = response.json().get("response", "") or ""
            log(f"LLaVA response ({len(result)} chars): {result[:300]}...")
            return result
        log(f"LLaVA error: {response.status_code} - {response.text[:200]}")
    except Exception:
        # Best-effort call: the caller treats "" as "no answer".
        log(f"LLaVA call failed: {traceback.format_exc()}")
    return ""


def call_ollama_text(prompt: str, model: str) -> str:
    """Send *prompt* to an Ollama text model (used for audio/description).

    Returns:
        The model's ``response`` text, or "" on any error (errors are logged).
    """
    try:
        log(f"Calling text model ({model})...")
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.2, "num_predict": 800}
            },
            timeout=60
        )
        if response.status_code == 200:
            return response.json().get("response", "") or ""
        # Previously non-200 answers vanished without a trace.
        log(f"Text model error: {response.status_code} - {response.text[:200]}")
    except Exception as e:
        log(f"Text model error: {e}")
    return ""
# ================== HUGGINGFACE FUNCTIONS ==================

def _hf_image_to_jpeg_bytes(image, max_size: int = 512, quality: int = 80) -> bytes:
    """Downscale *image* to fit *max_size* and return it as JPEG bytes."""
    longest = max(image.size)
    if longest > max_size:
        ratio = max_size / longest
        # Clamp to 1px so very elongated images never get a zero-sized side.
        new_size = (
            max(1, int(image.size[0] * ratio)),
            max(1, int(image.size[1] * ratio)),
        )
        image = image.resize(new_size, Image.Resampling.LANCZOS)

    # JPEG cannot store alpha or palette data; saving an RGBA/P/LA image
    # raises OSError, so normalise the mode first.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffer = io.BytesIO()
    image.save(buffer, format="JPEG", quality=quality)
    return buffer.getvalue()


def call_hf_image_caption(image: Image.Image) -> str:
    """Get an image caption from the HuggingFace BLIP captioning models.

    The image is downscaled to at most 512px and uploaded as JPEG. The large
    BLIP model is tried first, then the base model.

    Returns:
        The first non-empty caption, or "" when ``HF_TOKEN`` is unset, the
        image cannot be encoded, or every model fails (failures are logged).
    """
    if not HF_TOKEN:
        log("No HF_TOKEN")
        return ""

    # Encoding used to run outside any try block, so an unsupported image
    # mode (or a missing image) raised straight into the caller.
    try:
        payload = _hf_image_to_jpeg_bytes(image)
    except Exception as e:
        log(f"HF caption error: {e}")
        return ""

    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    models = [
        "Salesforce/blip-image-captioning-large",
        "Salesforce/blip-image-captioning-base",
    ]

    for model in models:
        try:
            log(f"Trying HF caption model: {model}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model}",
                headers=headers,
                data=payload,
                timeout=45
            )

            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and result:
                    caption = result[0].get("generated_text", "")
                    if caption:
                        log(f"HF caption: {caption}")
                        return caption
            elif response.status_code == 503:
                log(f"{model} loading, trying next...")
            else:
                log(f"HF error {response.status_code}: {response.text[:100]}")
        except Exception as e:
            log(f"HF caption error: {e}")

    return ""


def call_hf_text(prompt: str) -> str:
    """Generate text for *prompt* with a HuggingFace hosted text model.

    Models are tried in order until one returns non-empty text.

    Returns:
        The generated text with any echoed copy of *prompt* removed, or ""
        when ``HF_TOKEN`` is unset or every model fails (failures are logged).
    """
    if not HF_TOKEN:
        return ""

    headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
    models = [
        "mistralai/Mistral-7B-Instruct-v0.2",
        "HuggingFaceH4/zephyr-7b-beta",
        "google/flan-t5-xl",
    ]

    for model in models:
        try:
            log(f"Trying HF text model: {model}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model}",
                headers=headers,
                json={"inputs": prompt, "parameters": {"max_new_tokens": 600, "temperature": 0.3}},
                timeout=45
            )

            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and result:
                    text = result[0].get("generated_text", "") or ""
                    # Text-generation endpoints echo the prompt in front of
                    # the completion by default; drop it so downstream
                    # parsing sees only the model's answer.
                    if prompt and text.startswith(prompt):
                        text = text[len(prompt):].lstrip()
                    if text:
                        log(f"HF text ({len(text)} chars)")
                        return text
            elif response.status_code == 503:
                log(f"{model} loading, trying next...")
            else:
                # Previously auth/rate-limit errors were dropped silently.
                log(f"HF error {response.status_code}: {response.text[:100]}")
        except Exception as e:
            log(f"HF text error: {e}")

    return ""
# ================== PARSING ==================

# Placeholder "names" an LLM emits when it did not actually name a species.
_JSON_NAME_BLOCKLIST = frozenset(
    {"the bird", "bird", "unknown", "the image", "image"}
)
_TEXT_NAME_BLOCKLIST = frozenset(
    {"the bird", "bird", "unknown", "the", "this", "that",
     "image", "photo", "picture", "species"}
)

# Free-text fallbacks. Only the lead-in phrase is case-insensitive, so a
# sentence-initial "This is a ..." or "Species: ..." matches; the name itself
# must still start with a capital letter.
_TEXT_NAME_PATTERNS = (
    re.compile(
        r"(?i:this is|identified as|appears to be|looks like|most likely)"
        r"\s+(?:a|an|the)?\s*([A-Z][a-z]+(?:[-\s][A-Za-z]+){0,3})"
    ),
    # Name (Scientific name)
    re.compile(r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s*\(([A-Z][a-z]+\s[a-z]+)\)"),
    re.compile(r"(?i:species)[:\s]+([A-Z][a-z]+(?:\s[A-Za-z]+)?)"),
)


def _coerce_confidence(value, default: int = 70) -> int:
    """Turn an LLM confidence (85, "85%", 0.85, "high", None) into an int 1..99."""
    if isinstance(value, str):
        number_match = re.search(r"-?\d+(?:\.\d+)?", value)
        if not number_match:
            return default
        value = number_match.group()
    try:
        number = float(value)
        # A value strictly between 0 and 1 is a fraction, not a percentage.
        if 0 < number < 1:
            number *= 100
        return min(99, max(1, int(number)))
    except (TypeError, ValueError, OverflowError):
        return default


def _birds_from_json(raw_birds) -> List[Dict]:
    """Build bird dicts from the ``birds`` array of a parsed JSON answer."""
    if not isinstance(raw_birds, list):
        return []
    birds = []
    for entry in raw_birds:
        if not isinstance(entry, dict):
            continue
        name = str(entry.get("name") or "").strip()
        # Filter out garbage
        if len(name) <= 2 or name.lower() in _JSON_NAME_BLOCKLIST:
            continue
        birds.append({
            "name": name,
            "scientific_name": entry.get("scientific_name") or "",
            "confidence": _coerce_confidence(entry.get("confidence", 70)),
            "reason": entry.get("reason") or "Identified by AI",
        })
    return birds


def _birds_from_text(text: str) -> List[Dict]:
    """Return at most one bird found in free text by the fallback patterns."""
    for pattern in _TEXT_NAME_PATTERNS:
        for match in pattern.findall(text):
            if isinstance(match, tuple):
                # Two-group pattern: (common name, scientific name).
                name, scientific = match[0].strip(), match[1].strip()
            else:
                name, scientific = match.strip(), ""
            if len(name) > 3 and name.lower() not in _TEXT_NAME_BLOCKLIST:
                return [{
                    "name": name,
                    "scientific_name": scientific,
                    "confidence": 65,
                    "reason": "Extracted from AI analysis",
                }]
    return []


def parse_bird_response(text: str) -> Tuple[List[Dict], str]:
    """Parse an LLM response into bird identifications. NO HARDCODED FALLBACKS.

    A JSON object containing a ``"birds"`` array is tried first (trailing
    commas are tolerated). If that yields no usable entry, the text is
    scanned with a few phrase patterns and the first plausible name is used.

    Args:
        text: Raw model output; empty or None yields no birds.

    Returns:
        ``(birds, summary)``. Each bird is a dict with ``name``,
        ``scientific_name``, ``confidence`` (int, 1..99) and ``reason``.
        ``summary`` is the JSON ``summary`` field when one was parsed,
        otherwise "". The JSON path returns every valid entry; the text
        fallback returns at most one.
    """
    if not text:
        return [], ""

    log(f"Parsing response: {text[:500]}...")

    summary = ""

    # Try JSON first
    json_match = re.search(r'\{[\s\S]*"birds"[\s\S]*\}', text)
    if json_match:
        # Fix trailing commas
        json_str = re.sub(r',(\s*[}\]])', r'\1', json_match.group())
        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            log(f"JSON parse error: {e}")
        else:
            raw_summary = data.get("summary")
            summary = raw_summary if isinstance(raw_summary, str) else ""
            birds = _birds_from_json(data.get("birds"))
            if birds:
                return birds, summary

    # Fallback: extract a name from prose such as "This is a <Bird Name>"
    # or "<Bird Name> (<Scientific name>)".
    return _birds_from_text(text), summary


def get_bird_image(bird_name: str) -> str:
    """Get a bird image URL from the Wikipedia page-summary endpoint.

    Args:
        bird_name: Common name used as the Wikipedia page title.

    Returns:
        "" when *bird_name* is empty or shorter than 3 characters; otherwise
        the page's thumbnail (or original image) URL, or a placeholder image
        URL labelled with the first 10 characters of the name when the lookup
        fails or the page has no image.
    """
    if not bird_name or len(bird_name) < 3:
        return ""

    try:
        # Clean name for Wikipedia; safe="" also encodes "/" so the title
        # stays a single path segment.
        clean = bird_name.strip().replace(" ", "_")
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean, safe='')}"
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            if "thumbnail" in data:
                img_url = data["thumbnail"]["source"]
                log(f"Got Wikipedia image for {bird_name}")
                return img_url
            if "originalimage" in data:
                return data["originalimage"]["source"]
    except Exception as e:
        log(f"Wikipedia image error: {e}")

    # Fallback placeholder with bird name
    return f"https://via.placeholder.com/120x120/4299e1/ffffff?text={urllib.parse.quote(bird_name[:10])}"
{reason}
The AI response couldn't be parsed. Try a clearer image.
{summary or f"Identified using {method}"}
Try a clearer recording with less background noise.
{features_html}{summary}
Try adding more details.
{summary}
AI Bird Identification • Audio • Image • Description
Upload or record bird calls. Uses text AI to analyze acoustic features.
Upload a photo. Uses LLaVA vision AI to analyze the actual image.
Describe the bird - colors, size, behavior, sounds.