""" 🐦 BirdSense Pro - AI Bird Identification - Local: Ollama LLaVA (vision) + Llama3.2 (text/audio) - Cloud: HuggingFace BLIP-2 + Text models NO HARDCODED BIRDS - Pure AI identification """ import gradio as gr import numpy as np import scipy.signal as signal from typing import Tuple, List, Dict, Optional import json import requests import re import urllib.parse import os import traceback from PIL import Image import io import base64 # ================== CONFIG ================== OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") HF_TOKEN = os.environ.get("HF_TOKEN", "") DEBUG = True def log(msg): if DEBUG: print(f"[BirdSense] {msg}") # ================== CSS ================== CSS = """ .gradio-container { background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important; font-family: 'Inter', sans-serif !important; } .header { background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%); color: white; padding: 35px 20px; border-radius: 16px; text-align: center; margin-bottom: 16px; box-shadow: 0 10px 30px rgba(26, 54, 93, 0.25); } .header h1 { font-size: 2.2rem; font-weight: 800; margin: 0 0 8px 0; } .header .subtitle { font-size: 1rem; opacity: 0.9; margin-bottom: 10px; } .header .status { display: inline-flex; align-items: center; gap: 6px; background: rgba(255,255,255,0.15); padding: 6px 16px; border-radius: 50px; font-weight: 600; font-size: 0.85rem; } .status-dot { width: 8px; height: 8px; border-radius: 50%; } .status-green { background: #48bb78; } .status-yellow { background: #ecc94b; } .status-red { background: #fc8181; } .info-box { background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%); border: 1px solid #90cdf4; border-radius: 10px; padding: 14px; margin-bottom: 14px; } .info-box h3 { color: #2b6cb0; margin: 0 0 4px 0; font-size: 0.95rem; } .info-box p { color: #4299e1; margin: 0; font-size: 0.85rem; } .bird-card { background: white; border: 1px solid #e2e8f0; border-radius: 14px; padding: 16px; margin: 10px 0; display: flex; gap: 14px; box-shadow: 0 3px 10px rgba(0,0,0,0.04); } .bird-card img { width: 100px; height: 100px; object-fit: cover; border-radius: 10px; flex-shrink: 0; } .bird-info { flex: 1; min-width: 0; } .bird-info h3 { color: #1a202c; margin: 0 0 3px 0; font-size: 1.1rem; font-weight: 700; } .bird-info .scientific { color: #718096; font-style: italic; font-size: 0.8rem; margin-bottom: 8px; } .confidence { display: inline-block; padding: 3px 10px; border-radius: 16px; font-weight: 700; font-size: 0.75rem; } .conf-high { background: #c6f6d5; color: #22543d; } .conf-med { background: #fefcbf; color: #744210; } .conf-low { background: #fed7d7; color: #742a2a; } .reason { color: #4a5568; margin-top: 8px; line-height: 1.5; font-size: 0.85rem; } .error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 10px; padding: 16px; color: #c53030; } .success { background: #f0fff4; border: 1px solid #68d391; border-radius: 10px; padding: 16px; color: #276749; } .processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 10px; padding: 16px; color: #2b6cb0; } .features-box { background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; font-size: 0.8rem; } """ # ================== OLLAMA FUNCTIONS ================== def check_ollama_models() -> Dict: """Check available Ollama models.""" result = {"available": False, "vision_model": None, "text_model": None} try: response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3) if response.status_code == 200: models = [m["name"] for m in response.json().get("models", [])] log(f"Ollama models: {models}") result["available"] = True # Find vision model for m in models: if "llava" in m.lower() or "bakllava" in m.lower(): result["vision_model"] = m break # Find text model for m in models: if any(t in m.lower() for t in ["llama", "qwen", "mistral", "phi"]): if "llava" not in m.lower(): # Exclude vision models result["text_model"] = m break except Exception as e: log(f"Ollama check failed: {e}") return result def call_llava(image: Image.Image, prompt: str, model: str) -> str: """Call LLaVA vision model.""" try: # Resize image max_size = 768 if max(image.size) > max_size: ratio = max_size / max(image.size) image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS) # Convert to base64 buffer = io.BytesIO() image.save(buffer, format="JPEG", quality=85) img_b64 = base64.b64encode(buffer.getvalue()).decode() log(f"Calling LLaVA ({model}) with {len(img_b64)} bytes image...") response = requests.post( f"{OLLAMA_URL}/api/generate", json={ "model": model, "prompt": prompt, "images": [img_b64], "stream": False, "options": {"temperature": 0.1, "num_predict": 1200} }, timeout=120 ) if response.status_code == 200: result = response.json().get("response", "") log(f"LLaVA response ({len(result)} chars): {result[:300]}...") return result else: log(f"LLaVA error: {response.status_code} - {response.text[:200]}") except Exception as e: log(f"LLaVA call failed: {traceback.format_exc()}") return "" def call_ollama_text(prompt: str, model: str) -> str: """Call Ollama text model (for audio/description).""" try: log(f"Calling text model ({model})...") response = requests.post( f"{OLLAMA_URL}/api/generate", json={ "model": model, "prompt": prompt, "stream": False, "options": {"temperature": 0.2, "num_predict": 800} }, timeout=60 ) if response.status_code == 200: return response.json().get("response", "") except Exception as e: log(f"Text model error: {e}") return "" # ================== HUGGINGFACE FUNCTIONS ================== def call_hf_image_caption(image: Image.Image) -> str: """Get image caption from HuggingFace BLIP.""" if not HF_TOKEN: log("No HF_TOKEN") return "" headers = {"Authorization": f"Bearer {HF_TOKEN}"} # Resize max_size = 512 if max(image.size) > max_size: ratio = max_size / max(image.size) image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS) buffer = io.BytesIO() image.save(buffer, format="JPEG", quality=80) models = [ "Salesforce/blip-image-captioning-large", "Salesforce/blip-image-captioning-base", ] for model in models: try: log(f"Trying HF caption model: {model}") response = requests.post( f"https://api-inference.huggingface.co/models/{model}", headers=headers, data=buffer.getvalue(), timeout=45 ) if response.status_code == 200: result = response.json() if isinstance(result, list) and result: caption = result[0].get("generated_text", "") if caption: log(f"HF caption: {caption}") return caption elif response.status_code == 503: log(f"{model} loading, trying next...") else: log(f"HF error {response.status_code}: {response.text[:100]}") except Exception as e: log(f"HF caption error: {e}") return "" def call_hf_text(prompt: str) -> str: """Call HuggingFace text model.""" if not HF_TOKEN: return "" headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"} models = [ "mistralai/Mistral-7B-Instruct-v0.2", "HuggingFaceH4/zephyr-7b-beta", "google/flan-t5-xl", ] for model in models: try: log(f"Trying HF text model: {model}") response = requests.post( f"https://api-inference.huggingface.co/models/{model}", headers=headers, json={"inputs": prompt, "parameters": {"max_new_tokens": 600, "temperature": 0.3}}, timeout=45 ) if response.status_code == 200: result = response.json() if isinstance(result, list) and result: text = result[0].get("generated_text", "") if text: log(f"HF text ({len(text)} chars)") return text elif response.status_code == 503: continue except Exception as e: log(f"HF text error: {e}") return "" # ================== PARSING ================== def parse_bird_response(text: str) -> Tuple[List[Dict], str]: """Parse LLM response to extract bird identifications. NO HARDCODED FALLBACKS.""" birds = [] summary = "" if not text: return [], "" log(f"Parsing response: {text[:500]}...") # Try JSON first try: json_match = re.search(r'\{[\s\S]*"birds"[\s\S]*\}', text) if json_match: json_str = json_match.group() json_str = re.sub(r',(\s*[}\]])', r'\1', json_str) # Fix trailing commas data = json.loads(json_str) raw_birds = data.get("birds", []) summary = data.get("summary", "") for b in raw_birds: name = b.get("name", "").strip() # Filter out garbage if name and len(name) > 2 and name.lower() not in ["the bird", "bird", "unknown", "the image", "image"]: birds.append({ "name": name, "scientific_name": b.get("scientific_name", ""), "confidence": min(99, max(1, int(b.get("confidence", 70)))), "reason": b.get("reason", "Identified by AI") }) if birds: return birds, summary except json.JSONDecodeError as e: log(f"JSON parse error: {e}") # Fallback: Extract from text using patterns # Look for "This is a/an [Bird Name]" or "[Bird Name] (Scientific name)" patterns = [ r"(?:this is|identified as|appears to be|looks like|most likely)\s+(?:a|an|the)?\s*([A-Z][a-z]+(?:[-\s][A-Za-z]+){0,3})", r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s*\(([A-Z][a-z]+\s[a-z]+)\)", # Name (Scientific name) r"species[:\s]+([A-Z][a-z]+(?:\s[A-Za-z]+)?)", ] for pattern in patterns: matches = re.findall(pattern, text) for match in matches: if isinstance(match, tuple): name = match[0].strip() else: name = match.strip() # Validate it looks like a bird name if name and len(name) > 3 and name.lower() not in ["the bird", "bird", "unknown"]: # Check it's not a common non-bird word skip_words = ["the", "this", "that", "image", "photo", "picture", "bird", "species"] if name.lower() not in skip_words: birds.append({ "name": name, "scientific_name": "", "confidence": 65, "reason": "Extracted from AI analysis" }) break if birds: break return birds[:3], summary # Max 3 birds def get_bird_image(bird_name: str) -> str: """Get bird image from Wikipedia.""" if not bird_name or len(bird_name) < 3: return "" try: # Clean name for Wikipedia clean = bird_name.strip().replace(" ", "_") url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean)}" response = requests.get(url, timeout=5) if response.status_code == 200: data = response.json() if "thumbnail" in data: img_url = data["thumbnail"]["source"] log(f"Got Wikipedia image for {bird_name}") return img_url elif "originalimage" in data: return data["originalimage"]["source"] except Exception as e: log(f"Wikipedia image error: {e}") # Fallback placeholder with bird name return f"https://via.placeholder.com/120x120/4299e1/ffffff?text={urllib.parse.quote(bird_name[:10])}" def format_bird_card(bird: Dict, index: int) -> str: """Format bird as HTML card.""" name = bird.get("name", "Unknown") scientific = bird.get("scientific_name", "") confidence = bird.get("confidence", 50) reason = bird.get("reason", "") img_url = get_bird_image(name) conf_class = "conf-high" if confidence >= 80 else "conf-med" if confidence >= 60 else "conf-low" return f"""
{name}

{index}. {name}

{f'
{scientific}
' if scientific else ''} {confidence}% confidence

{reason}

""" # ================== IDENTIFICATION FUNCTIONS ================== IMAGE_PROMPT = """Look at this bird image carefully. Identify the bird species. You MUST respond with valid JSON in this exact format: { "birds": [ { "name": "Blue-and-yellow Macaw", "scientific_name": "Ara ararauna", "confidence": 95, "reason": "Large parrot with bright blue wings and yellow underparts, characteristic of this species" } ], "summary": "This is a Blue-and-yellow Macaw, a large South American parrot." } Look for: - Beak shape and color - Body colors and patterns - Size and shape - Any distinctive markings Give the ACTUAL species name (not "bird" or "unknown"). If unsure, give your best guess with lower confidence. Return ONLY the JSON.""" def identify_image_stream(image): """Identify bird from image.""" if image is None: yield '
⚠️ Please upload an image
' return try: if not isinstance(image, Image.Image): image = Image.fromarray(np.array(image)) image = image.convert("RGB") yield '
🔍 Analyzing image...
' models = check_ollama_models() response = "" method = "" # Try LLaVA first (best for images) if models["vision_model"]: yield f'
🦙 Using LLaVA vision model...
' response = call_llava(image, IMAGE_PROMPT, models["vision_model"]) method = "LLaVA Vision" # Fallback to HuggingFace if not response: yield '
☁️ Using HuggingFace AI...
' # Get caption first caption = call_hf_image_caption(image) if caption: yield f'
🔍 Identifying from caption...
AI sees: {caption}
' # Use text model to identify text_prompt = f"""Based on this image description, identify the bird species: "{caption}" Respond with JSON: {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}} Give the ACTUAL bird species name. Return ONLY JSON.""" if models["text_model"]: response = call_ollama_text(text_prompt, models["text_model"]) if not response: response = call_hf_text(text_prompt) method = "HuggingFace BLIP + Text" else: yield '
❌ Could not analyze image. HuggingFace API may be unavailable.
' return # Parse response birds, summary = parse_bird_response(response) if not birds: yield f'''
❌ Could not identify bird species

The AI response couldn't be parsed. Try a clearer image.

Raw AI response:
{response[:500] if response else "No response"}
''' return # Success result = f'''

🐦 {len(birds)} Bird(s) Identified!

{summary or f"Identified using {method}"}

''' for i, bird in enumerate(birds, 1): result += format_bird_card(bird, i) yield result except Exception as e: log(f"Image error: {traceback.format_exc()}") yield f'
❌ Error: {str(e)}
' # ================== AUDIO IDENTIFICATION ================== def process_audio(audio_data: np.ndarray, sr: int) -> Dict: """Extract audio features for bird identification.""" try: audio = audio_data.astype(np.float64) if np.max(np.abs(audio)) > 0: audio = audio / np.max(np.abs(audio)) # Bandpass filter (500Hz - 10kHz for birds) nyq = sr / 2 low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99) if low < high: b, a = signal.butter(4, [low, high], btype='band') audio = signal.filtfilt(b, a, audio) duration = len(audio_data) / sr # Peak frequency fft = np.fft.rfft(audio) freqs = np.fft.rfftfreq(len(audio), 1/sr) peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0 # Count syllables envelope = np.abs(signal.hilbert(audio)) threshold = np.mean(envelope) + 0.5 * np.std(envelope) syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0) return { "duration": round(duration, 2), "peak_freq": int(peak_freq), "syllables": int(syllables), "freq_range": "high" if peak_freq > 3000 else "medium" if peak_freq > 1000 else "low" } except: return {"duration": 0, "peak_freq": 0, "syllables": 0, "freq_range": "unknown"} AUDIO_PROMPT = """You are an expert ornithologist. Identify the bird from these audio features: - Duration: {duration} seconds - Peak Frequency: {peak_freq} Hz ({freq_range} range) - Syllables/notes detected: {syllables} {extra} Based on these acoustic features, identify possible bird species. High frequency (>3000 Hz) = small birds like warblers, finches Medium frequency (1000-3000 Hz) = thrushes, bulbuls, mynas Low frequency (<1000 Hz) = larger birds like crows, doves Respond with JSON ONLY: {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 70, "reason": "Matches because..."}}], "summary": "..."}} Give ACTUAL species names, not generic terms.""" def identify_audio_stream(audio_input, location: str = "", month: str = ""): """Identify bird from audio - uses TEXT model, not vision.""" if audio_input is None: yield '
⚠️ Please upload or record audio
' return try: if isinstance(audio_input, tuple): sr, audio_data = audio_input else: yield '
⚠️ Invalid audio format
' return if len(audio_data) == 0: yield '
⚠️ Empty audio
' return if len(audio_data.shape) > 1: audio_data = np.mean(audio_data, axis=1) yield '
🔊 Analyzing audio features...
' features = process_audio(audio_data, sr) features_html = f'''
🎵 Audio Analysis
• Duration: {features["duration"]}s | Peak: {features["peak_freq"]} Hz ({features["freq_range"]})
• Syllables: {features["syllables"]}
''' yield f'
🤖 Identifying bird...
{features_html}' extra = "" if location: extra += f"\n- Location: {location}" if month: extra += f"\n- Month: {month}" prompt = AUDIO_PROMPT.format(**features, extra=extra) models = check_ollama_models() response = "" # Use TEXT model for audio (NOT vision!) if models["text_model"]: yield f'
🦙 Using {models["text_model"]}...
{features_html}' response = call_ollama_text(prompt, models["text_model"]) if not response: yield f'
☁️ Using HuggingFace...
{features_html}' response = call_hf_text(prompt) birds, summary = parse_bird_response(response) if not birds: yield f'''
Could not identify bird from audio

Try a clearer recording with less background noise.

{features_html}
''' return result = f'''

🐦 {len(birds)} Bird(s) Identified!

{summary}

{features_html}''' for i, bird in enumerate(birds, 1): result += format_bird_card(bird, i) yield result except Exception as e: log(f"Audio error: {traceback.format_exc()}") yield f'
❌ Error: {str(e)}
' # ================== DESCRIPTION IDENTIFICATION ================== def identify_description_stream(description: str): """Identify bird from text description.""" if not description or len(description.strip()) < 5: yield '
⚠️ Please enter a description
' return try: yield '
🔍 Analyzing description...
' prompt = f"""Identify the bird species from this description: "{description}" Respond with JSON: {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}} Use ACTUAL species names. Return ONLY JSON.""" models = check_ollama_models() response = "" if models["text_model"]: yield '
🦙 Using local AI...
' response = call_ollama_text(prompt, models["text_model"]) if not response: yield '
☁️ Using HuggingFace...
' response = call_hf_text(prompt) birds, summary = parse_bird_response(response) if not birds: yield '
Could not identify bird

Try adding more details.

' return result = f'''

🐦 {len(birds)} Bird(s) Match!

{summary}

''' for i, bird in enumerate(birds, 1): result += format_bird_card(bird, i) yield result except Exception as e: yield f'
❌ Error: {str(e)}
' # ================== UI ================== def get_status_html(): """Get status indicator.""" models = check_ollama_models() if models["vision_model"]: return f' LLaVA + {models["text_model"] or "HF"}' elif models["text_model"]: return f' {models["text_model"]} (no vision)' elif HF_TOKEN: return ' HuggingFace Cloud' else: return ' Limited Mode' def create_app(): with gr.Blocks(title="BirdSense Pro") as demo: gr.HTML(f"") gr.HTML(f"""

🐦 BirdSense Pro

AI Bird Identification • Audio • Image • Description

{get_status_html()}
""") # AUDIO FIRST with gr.Tab("🎵 Audio"): gr.HTML('

🎵 Audio Identification

Upload or record bird calls. Uses text AI to analyze acoustic features.

') with gr.Row(): with gr.Column(): audio_in = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Audio") with gr.Row(): loc = gr.Textbox(label="📍 Location", placeholder="e.g., Mumbai") mon = gr.Dropdown(label="📅 Month", choices=[""] + ["January","February","March","April","May","June","July","August","September","October","November","December"]) audio_btn = gr.Button("🔍 Identify", variant="primary", size="lg") with gr.Column(): audio_out = gr.HTML('
🎵 Upload audio to identify
') audio_btn.click(identify_audio_stream, [audio_in, loc, mon], audio_out) # IMAGE with gr.Tab("📷 Image"): gr.HTML('

📷 Image Identification

Upload a photo. Uses LLaVA vision AI to analyze the actual image.

') with gr.Row(): with gr.Column(): img_in = gr.Image(sources=["upload", "webcam"], type="pil", label="📸 Photo") img_btn = gr.Button("🔍 Identify", variant="primary", size="lg") with gr.Column(): img_out = gr.HTML('
📷 Upload image to identify
') img_btn.click(identify_image_stream, [img_in], img_out) # DESCRIPTION with gr.Tab("📝 Description"): gr.HTML('

📝 Text Description

Describe the bird - colors, size, behavior, sounds.

') with gr.Row(): with gr.Column(): desc_in = gr.Textbox(label="✍️ Description", lines=3, placeholder="e.g., Large blue and yellow parrot with long tail") desc_btn = gr.Button("🔍 Identify", variant="primary", size="lg") with gr.Column(): desc_out = gr.HTML('
📝 Describe a bird
') desc_btn.click(identify_description_stream, [desc_in], desc_out) gr.HTML('
BirdSense Pro • Local: LLaVA (image) + Llama3.2 (audio/text) • Cloud: HuggingFace BLIP
') return demo if __name__ == "__main__": log("Starting BirdSense Pro...") models = check_ollama_models() log(f"Vision: {models['vision_model']}, Text: {models['text_model']}, HF: {bool(HF_TOKEN)}") app = create_app() app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)