| """ | |
| π¦ BirdSense Pro - AI Bird Identification | |
| - Local: Ollama LLaVA (vision) + Llama3.2 (text/audio) | |
| - Cloud: HuggingFace BLIP-2 + Text models | |
| NO HARDCODED BIRDS - Pure AI identification | |
| """ | |
| import gradio as gr | |
| import numpy as np | |
| import scipy.signal as signal | |
| from typing import Tuple, List, Dict, Optional | |
| import json | |
| import requests | |
| import re | |
| import urllib.parse | |
| import os | |
| import traceback | |
| from PIL import Image | |
| import io | |
| import base64 | |
| # ================== CONFIG ================== | |
| OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") | |
| HF_TOKEN = os.environ.get("HF_TOKEN", "") | |
| DEBUG = True | |
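# Example environment setup before launching (values are illustrative):
#   export OLLAMA_URL="http://localhost:11434"   # default Ollama endpoint
#   export HF_TOKEN="hf_..."                     # optional; enables the HuggingFace cloud fallback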
def log(msg):
    if DEBUG:
        print(f"[BirdSense] {msg}")

# ================== CSS ==================
CSS = """
.gradio-container {
    background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important;
    font-family: 'Inter', sans-serif !important;
}
.header {
    background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%);
    color: white; padding: 35px 20px; border-radius: 16px;
    text-align: center; margin-bottom: 16px;
    box-shadow: 0 10px 30px rgba(26, 54, 93, 0.25);
}
.header h1 { font-size: 2.2rem; font-weight: 800; margin: 0 0 8px 0; }
.header .subtitle { font-size: 1rem; opacity: 0.9; margin-bottom: 10px; }
.header .status {
    display: inline-flex; align-items: center; gap: 6px;
    background: rgba(255,255,255,0.15); padding: 6px 16px; border-radius: 50px;
    font-weight: 600; font-size: 0.85rem;
}
.status-dot { width: 8px; height: 8px; border-radius: 50%; }
.status-green { background: #48bb78; }
.status-yellow { background: #ecc94b; }
.status-red { background: #fc8181; }
.info-box {
    background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%);
    border: 1px solid #90cdf4; border-radius: 10px; padding: 14px; margin-bottom: 14px;
}
.info-box h3 { color: #2b6cb0; margin: 0 0 4px 0; font-size: 0.95rem; }
.info-box p { color: #4299e1; margin: 0; font-size: 0.85rem; }
.bird-card {
    background: white; border: 1px solid #e2e8f0; border-radius: 14px;
    padding: 16px; margin: 10px 0; display: flex; gap: 14px;
    box-shadow: 0 3px 10px rgba(0,0,0,0.04);
}
.bird-card img { width: 100px; height: 100px; object-fit: cover; border-radius: 10px; flex-shrink: 0; }
.bird-info { flex: 1; min-width: 0; }
.bird-info h3 { color: #1a202c; margin: 0 0 3px 0; font-size: 1.1rem; font-weight: 700; }
.bird-info .scientific { color: #718096; font-style: italic; font-size: 0.8rem; margin-bottom: 8px; }
.confidence { display: inline-block; padding: 3px 10px; border-radius: 16px; font-weight: 700; font-size: 0.75rem; }
.conf-high { background: #c6f6d5; color: #22543d; }
.conf-med { background: #fefcbf; color: #744210; }
.conf-low { background: #fed7d7; color: #742a2a; }
.reason { color: #4a5568; margin-top: 8px; line-height: 1.5; font-size: 0.85rem; }
.error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 10px; padding: 16px; color: #c53030; }
.success { background: #f0fff4; border: 1px solid #68d391; border-radius: 10px; padding: 16px; color: #276749; }
.processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 10px; padding: 16px; color: #2b6cb0; }
.features-box { background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; font-size: 0.8rem; }
"""

# ================== OLLAMA FUNCTIONS ==================
def check_ollama_models() -> Dict:
    """Check available Ollama models."""
    result = {"available": False, "vision_model": None, "text_model": None}
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
        if response.status_code == 200:
            models = [m["name"] for m in response.json().get("models", [])]
            log(f"Ollama models: {models}")
            result["available"] = True
            # Find vision model
            for m in models:
                if "llava" in m.lower() or "bakllava" in m.lower():
                    result["vision_model"] = m
                    break
            # Find text model
            for m in models:
                if any(t in m.lower() for t in ["llama", "qwen", "mistral", "phi"]):
                    if "llava" not in m.lower():  # Exclude vision models
                        result["text_model"] = m
                        break
    except Exception as e:
        log(f"Ollama check failed: {e}")
    return result
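# Illustrative return value when both model families are installed
# (the model tags here are assumptions, not requirements):
#   {"available": True, "vision_model": "llava:13b", "text_model": "llama3.2:3b"}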
def call_llava(image: Image.Image, prompt: str, model: str) -> str:
    """Call LLaVA vision model."""
    try:
        # Resize image
        max_size = 768
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)
        # Convert to base64
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG", quality=85)
        img_b64 = base64.b64encode(buffer.getvalue()).decode()
        log(f"Calling LLaVA ({model}) with a {len(img_b64)}-char base64 image...")
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "images": [img_b64],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 1200}
            },
            timeout=120
        )
        if response.status_code == 200:
            result = response.json().get("response", "")
            log(f"LLaVA response ({len(result)} chars): {result[:300]}...")
            return result
        else:
            log(f"LLaVA error: {response.status_code} - {response.text[:200]}")
    except Exception:
        log(f"LLaVA call failed: {traceback.format_exc()}")
    return ""
def call_ollama_text(prompt: str, model: str) -> str:
    """Call Ollama text model (for audio/description)."""
    try:
        log(f"Calling text model ({model})...")
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.2, "num_predict": 800}
            },
            timeout=60
        )
        if response.status_code == 200:
            return response.json().get("response", "")
    except Exception as e:
        log(f"Text model error: {e}")
    return ""
# ================== HUGGINGFACE FUNCTIONS ==================
def call_hf_image_caption(image: Image.Image) -> str:
    """Get image caption from HuggingFace BLIP."""
    if not HF_TOKEN:
        log("No HF_TOKEN")
        return ""
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    # Resize
    max_size = 512
    if max(image.size) > max_size:
        ratio = max_size / max(image.size)
        image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG", quality=80)
    models = [
        "Salesforce/blip-image-captioning-large",
        "Salesforce/blip-image-captioning-base",
    ]
    for model in models:
        try:
            log(f"Trying HF caption model: {model}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model}",
                headers=headers,
                data=buffer.getvalue(),
                timeout=45
            )
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and result:
                    caption = result[0].get("generated_text", "")
                    if caption:
                        log(f"HF caption: {caption}")
                        return caption
            elif response.status_code == 503:
                log(f"{model} loading, trying next...")
            else:
                log(f"HF error {response.status_code}: {response.text[:100]}")
        except Exception as e:
            log(f"HF caption error: {e}")
    return ""
def call_hf_text(prompt: str) -> str:
    """Call HuggingFace text model."""
    if not HF_TOKEN:
        return ""
    headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
    models = [
        "mistralai/Mistral-7B-Instruct-v0.2",
        "HuggingFaceH4/zephyr-7b-beta",
        "google/flan-t5-xl",
    ]
    for model in models:
        try:
            log(f"Trying HF text model: {model}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model}",
                headers=headers,
                json={"inputs": prompt, "parameters": {"max_new_tokens": 600, "temperature": 0.3}},
                timeout=45
            )
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and result:
                    text = result[0].get("generated_text", "")
                    if text:
                        log(f"HF text ({len(text)} chars)")
                        return text
            elif response.status_code == 503:
                continue
        except Exception as e:
            log(f"HF text error: {e}")
    return ""
# ================== PARSING ==================
def parse_bird_response(text: str) -> Tuple[List[Dict], str]:
    """Parse LLM response to extract bird identifications. NO HARDCODED FALLBACKS."""
    birds = []
    summary = ""
    if not text:
        return [], ""
    log(f"Parsing response: {text[:500]}...")
    # Try JSON first
    try:
        json_match = re.search(r'\{[\s\S]*"birds"[\s\S]*\}', text)
        if json_match:
            json_str = json_match.group()
            json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)  # Fix trailing commas
            data = json.loads(json_str)
            raw_birds = data.get("birds", [])
            summary = data.get("summary", "")
            for b in raw_birds:
                name = b.get("name", "").strip()
                # Filter out garbage
                if name and len(name) > 2 and name.lower() not in ["the bird", "bird", "unknown", "the image", "image"]:
                    birds.append({
                        "name": name,
                        "scientific_name": b.get("scientific_name", ""),
                        "confidence": min(99, max(1, int(b.get("confidence", 70)))),
                        "reason": b.get("reason", "Identified by AI")
                    })
            if birds:
                return birds, summary
    except json.JSONDecodeError as e:
        log(f"JSON parse error: {e}")
    # Fallback: extract from prose using patterns like
    # "This is a/an [Bird Name]" or "[Bird Name] (Scientific name)".
    # The lead-in verbs tolerate sentence-initial capitals; the capture group
    # stays case-sensitive so only capitalized species names match.
    patterns = [
        r"(?:[Tt]his is|[Ii]dentified as|[Aa]ppears to be|[Ll]ooks like|[Mm]ost likely)\s+(?:a|an|the)?\s*([A-Z][a-z]+(?:[-\s][A-Za-z]+){0,3})",
        r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s*\(([A-Z][a-z]+\s[a-z]+)\)",  # Name (Scientific name)
        r"species[:\s]+([A-Z][a-z]+(?:\s[A-Za-z]+)?)",
    ]
    for pattern in patterns:
        matches = re.findall(pattern, text)
        for match in matches:
            if isinstance(match, tuple):
                name = match[0].strip()
            else:
                name = match.strip()
            # Validate it looks like a bird name
            if name and len(name) > 3 and name.lower() not in ["the bird", "bird", "unknown"]:
                # Check it's not a common non-bird word
                skip_words = ["the", "this", "that", "image", "photo", "picture", "bird", "species"]
                if name.lower() not in skip_words:
                    birds.append({
                        "name": name,
                        "scientific_name": "",
                        "confidence": 65,
                        "reason": "Extracted from AI analysis"
                    })
                    break
        if birds:
            break
    return birds[:3], summary  # Max 3 birds
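# Minimal smoke test for the parser; not wired into the UI, call it manually.
def _parse_example():
    """Illustrative check of parse_bird_response on the two shapes it handles.

    Both sample responses are made up; they only demonstrate the JSON path
    and the free-form prose fallback.
    """
    sample = ('{"birds": [{"name": "House Sparrow", "scientific_name": "Passer domesticus", '
              '"confidence": 90, "reason": "Small brown bird with a grey crown"}], '
              '"summary": "A House Sparrow."}')
    birds, summary = parse_bird_response(sample)
    assert birds[0]["name"] == "House Sparrow" and summary == "A House Sparrow."
    birds, _ = parse_bird_response("It appears to be a Northern Cardinal.")
    assert birds[0]["name"] == "Northern Cardinal"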
def get_bird_image(bird_name: str) -> str:
    """Get bird image from Wikipedia."""
    if not bird_name or len(bird_name) < 3:
        return ""
    try:
        # Clean name for Wikipedia
        clean = bird_name.strip().replace(" ", "_")
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean)}"
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            if "thumbnail" in data:
                img_url = data["thumbnail"]["source"]
                log(f"Got Wikipedia image for {bird_name}")
                return img_url
            elif "originalimage" in data:
                return data["originalimage"]["source"]
    except Exception as e:
        log(f"Wikipedia image error: {e}")
    # Fallback placeholder with bird name
    return f"https://via.placeholder.com/120x120/4299e1/ffffff?text={urllib.parse.quote(bird_name[:10])}"
def format_bird_card(bird: Dict, index: int) -> str:
    """Format bird as HTML card."""
    name = bird.get("name", "Unknown")
    scientific = bird.get("scientific_name", "")
    confidence = bird.get("confidence", 50)
    reason = bird.get("reason", "")
    img_url = get_bird_image(name)
    conf_class = "conf-high" if confidence >= 80 else "conf-med" if confidence >= 60 else "conf-low"
    return f"""
    <div class="bird-card">
        <img src="{img_url}" alt="{name}" onerror="this.style.display='none'">
        <div class="bird-info">
            <h3>{index}. {name}</h3>
            {f'<div class="scientific">{scientific}</div>' if scientific else ''}
            <span class="confidence {conf_class}">{confidence}% confidence</span>
            <p class="reason">{reason}</p>
        </div>
    </div>"""
# ================== IDENTIFICATION FUNCTIONS ==================
IMAGE_PROMPT = """Look at this bird image carefully. Identify the bird species.
You MUST respond with valid JSON in this exact format:
{
  "birds": [
    {
      "name": "Blue-and-yellow Macaw",
      "scientific_name": "Ara ararauna",
      "confidence": 95,
      "reason": "Large parrot with bright blue wings and yellow underparts, characteristic of this species"
    }
  ],
  "summary": "This is a Blue-and-yellow Macaw, a large South American parrot."
}
Look for:
- Beak shape and color
- Body colors and patterns
- Size and shape
- Any distinctive markings
Give the ACTUAL species name (not "bird" or "unknown"). If unsure, give your best guess with lower confidence.
Return ONLY the JSON."""
def identify_image_stream(image):
    """Identify bird from image."""
    if image is None:
        yield '<div class="error">⚠️ Please upload an image</div>'
        return
    try:
        if not isinstance(image, Image.Image):
            image = Image.fromarray(np.array(image))
        image = image.convert("RGB")
        yield '<div class="processing">🔍 Analyzing image...</div>'
        models = check_ollama_models()
        response = ""
        method = ""
        # Try LLaVA first (best for images)
        if models["vision_model"]:
            yield '<div class="processing">🦙 Using LLaVA vision model...</div>'
            response = call_llava(image, IMAGE_PROMPT, models["vision_model"])
            method = "LLaVA Vision"
        # Fall back to HuggingFace
        if not response:
            yield '<div class="processing">☁️ Using HuggingFace AI...</div>'
            # Get caption first
            caption = call_hf_image_caption(image)
            if caption:
                yield f'<div class="processing">🔍 Identifying from caption...</div><div class="features-box"><b>AI sees:</b> {caption}</div>'
                # Use text model to identify
                text_prompt = f"""Based on this image description, identify the bird species:
"{caption}"
Respond with JSON:
{{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}
Give the ACTUAL bird species name. Return ONLY JSON."""
                if models["text_model"]:
                    response = call_ollama_text(text_prompt, models["text_model"])
                if not response:
                    response = call_hf_text(text_prompt)
                method = "HuggingFace BLIP + Text"
            else:
                yield '<div class="error">❌ Could not analyze image. HuggingFace API may be unavailable.</div>'
                return
        # Parse response
        birds, summary = parse_bird_response(response)
        if not birds:
            yield f'''<div class="error">
                <b>❌ Could not identify bird species</b>
                <p>The AI response couldn't be parsed. Try a clearer image.</p>
                <div class="features-box"><b>Raw AI response:</b><br>{response[:500] if response else "No response"}</div>
            </div>'''
            return
        # Success
        result = f'''<div class="success">
            <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
            <p>{summary or f"Identified using {method}"}</p>
        </div>'''
        for i, bird in enumerate(birds, 1):
            result += format_bird_card(bird, i)
        yield result
    except Exception as e:
        log(f"Image error: {traceback.format_exc()}")
        yield f'<div class="error">❌ Error: {str(e)}</div>'
# ================== AUDIO IDENTIFICATION ==================
def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
    """Extract audio features for bird identification."""
    try:
        audio = audio_data.astype(np.float64)
        if np.max(np.abs(audio)) > 0:
            audio = audio / np.max(np.abs(audio))
        # Bandpass filter (500 Hz - 10 kHz covers most bird vocalizations)
        nyq = sr / 2
        low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99)
        if low < high:
            b, a = signal.butter(4, [low, high], btype='band')
            audio = signal.filtfilt(b, a, audio)
        duration = len(audio_data) / sr
        # Peak frequency
        fft = np.fft.rfft(audio)
        freqs = np.fft.rfftfreq(len(audio), 1/sr)
        peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0
        # Count syllables via rising edges of the amplitude envelope
        envelope = np.abs(signal.hilbert(audio))
        threshold = np.mean(envelope) + 0.5 * np.std(envelope)
        syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0)
        return {
            "duration": round(duration, 2),
            "peak_freq": int(peak_freq),
            "syllables": int(syllables),
            "freq_range": "high" if peak_freq > 3000 else "medium" if peak_freq > 1000 else "low"
        }
    except Exception:
        return {"duration": 0, "peak_freq": 0, "syllables": 0, "freq_range": "unknown"}
AUDIO_PROMPT = """You are an expert ornithologist. Identify the bird from these audio features:
- Duration: {duration} seconds
- Peak Frequency: {peak_freq} Hz ({freq_range} range)
- Syllables/notes detected: {syllables}
{extra}
Based on these acoustic features, identify possible bird species.
High frequency (>3000 Hz) = small birds like warblers, finches
Medium frequency (1000-3000 Hz) = thrushes, bulbuls, mynas
Low frequency (<1000 Hz) = larger birds like crows, doves
Respond with JSON ONLY:
{{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 70, "reason": "Matches because..."}}], "summary": "..."}}
Give ACTUAL species names, not generic terms."""
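# Example of a fully formatted audio prompt (feature values are illustrative):
#   AUDIO_PROMPT.format(duration=3.5, peak_freq=4200, freq_range="high",
#                       syllables=12, extra="\n- Location: Mumbai\n- Month: June")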
def identify_audio_stream(audio_input, location: str = "", month: str = ""):
    """Identify bird from audio - uses the TEXT model, not vision."""
    if audio_input is None:
        yield '<div class="error">⚠️ Please upload or record audio</div>'
        return
    try:
        if isinstance(audio_input, tuple):
            sr, audio_data = audio_input
        else:
            yield '<div class="error">⚠️ Invalid audio format</div>'
            return
        if len(audio_data) == 0:
            yield '<div class="error">⚠️ Empty audio</div>'
            return
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)  # Downmix stereo to mono
        yield '<div class="processing">🔊 Analyzing audio features...</div>'
        features = process_audio(audio_data, sr)
        features_html = f'''<div class="features-box">
            <b>🎵 Audio Analysis</b><br>
            • Duration: {features["duration"]}s | Peak: {features["peak_freq"]} Hz ({features["freq_range"]})<br>
            • Syllables: {features["syllables"]}
        </div>'''
        yield f'<div class="processing">🤖 Identifying bird...</div>{features_html}'
        extra = ""
        if location:
            extra += f"\n- Location: {location}"
        if month:
            extra += f"\n- Month: {month}"
        prompt = AUDIO_PROMPT.format(**features, extra=extra)
        models = check_ollama_models()
        response = ""
        # Use the TEXT model for audio (NOT vision!)
        if models["text_model"]:
            yield f'<div class="processing">🦙 Using {models["text_model"]}...</div>{features_html}'
            response = call_ollama_text(prompt, models["text_model"])
        if not response:
            yield f'<div class="processing">☁️ Using HuggingFace...</div>{features_html}'
            response = call_hf_text(prompt)
        birds, summary = parse_bird_response(response)
        if not birds:
            yield f'''<div class="error">
                <b>Could not identify bird from audio</b>
                <p>Try a clearer recording with less background noise.</p>
                {features_html}
            </div>'''
            return
        result = f'''<div class="success">
            <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
            <p>{summary}</p>
        </div>{features_html}'''
        for i, bird in enumerate(birds, 1):
            result += format_bird_card(bird, i)
        yield result
    except Exception as e:
        log(f"Audio error: {traceback.format_exc()}")
        yield f'<div class="error">❌ Error: {str(e)}</div>'
# ================== DESCRIPTION IDENTIFICATION ==================
def identify_description_stream(description: str):
    """Identify bird from text description."""
    if not description or len(description.strip()) < 5:
        yield '<div class="error">⚠️ Please enter a description</div>'
        return
    try:
        yield '<div class="processing">🔍 Analyzing description...</div>'
        prompt = f"""Identify the bird species from this description:
"{description}"
Respond with JSON:
{{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}
Use ACTUAL species names. Return ONLY JSON."""
        models = check_ollama_models()
        response = ""
        if models["text_model"]:
            yield '<div class="processing">🦙 Using local AI...</div>'
            response = call_ollama_text(prompt, models["text_model"])
        if not response:
            yield '<div class="processing">☁️ Using HuggingFace...</div>'
            response = call_hf_text(prompt)
        birds, summary = parse_bird_response(response)
        if not birds:
            yield '<div class="error"><b>Could not identify bird</b><p>Try adding more details.</p></div>'
            return
        result = f'''<div class="success">
            <h3>🐦 {len(birds)} Bird(s) Match!</h3>
            <p>{summary}</p>
        </div>'''
        for i, bird in enumerate(birds, 1):
            result += format_bird_card(bird, i)
        yield result
    except Exception as e:
        yield f'<div class="error">❌ Error: {str(e)}</div>'
# ================== UI ==================
def get_status_html():
    """Get status indicator."""
    models = check_ollama_models()
    if models["vision_model"]:
        return f'<span class="status-dot status-green"></span> LLaVA + {models["text_model"] or "HF"}'
    elif models["text_model"]:
        return f'<span class="status-dot status-yellow"></span> {models["text_model"]} (no vision)'
    elif HF_TOKEN:
        return '<span class="status-dot status-yellow"></span> HuggingFace Cloud'
    else:
        return '<span class="status-dot status-red"></span> Limited Mode'
def create_app():
    with gr.Blocks(title="BirdSense Pro") as demo:
        gr.HTML(f"<style>{CSS}</style>")
        gr.HTML(f"""
        <div class="header">
            <h1>🦜 BirdSense Pro</h1>
            <p class="subtitle">AI Bird Identification • Audio • Image • Description</p>
            <div class="status">{get_status_html()}</div>
        </div>""")
        # AUDIO FIRST
        with gr.Tab("🎵 Audio"):
            gr.HTML('<div class="info-box"><h3>🎵 Audio Identification</h3><p>Upload or record bird calls. Uses text AI to analyze acoustic features.</p></div>')
            with gr.Row():
                with gr.Column():
                    audio_in = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Audio")
                    with gr.Row():
                        loc = gr.Textbox(label="📍 Location", placeholder="e.g., Mumbai")
                        mon = gr.Dropdown(label="📅 Month", choices=[""] + ["January","February","March","April","May","June","July","August","September","October","November","December"])
                    audio_btn = gr.Button("🔍 Identify", variant="primary", size="lg")
                with gr.Column():
                    audio_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">🎵 Upload audio to identify</div>')
            audio_btn.click(identify_audio_stream, [audio_in, loc, mon], audio_out)
        # IMAGE
        with gr.Tab("📷 Image"):
            gr.HTML('<div class="info-box"><h3>📷 Image Identification</h3><p>Upload a photo. Uses LLaVA vision AI to analyze the actual image.</p></div>')
            with gr.Row():
                with gr.Column():
                    img_in = gr.Image(sources=["upload", "webcam"], type="pil", label="📸 Photo")
                    img_btn = gr.Button("🔍 Identify", variant="primary", size="lg")
                with gr.Column():
                    img_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">📷 Upload image to identify</div>')
            img_btn.click(identify_image_stream, [img_in], img_out)
        # DESCRIPTION
        with gr.Tab("📝 Description"):
            gr.HTML('<div class="info-box"><h3>📝 Text Description</h3><p>Describe the bird - colors, size, behavior, sounds.</p></div>')
            with gr.Row():
                with gr.Column():
                    desc_in = gr.Textbox(label="✍️ Description", lines=3, placeholder="e.g., Large blue and yellow parrot with long tail")
                    desc_btn = gr.Button("🔍 Identify", variant="primary", size="lg")
                with gr.Column():
                    desc_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">📝 Describe a bird</div>')
            desc_btn.click(identify_description_stream, [desc_in], desc_out)
        gr.HTML('<div style="text-align:center;padding:10px;color:#718096;font-size:0.8rem"><b>BirdSense Pro</b> • Local: LLaVA (image) + Llama3.2 (audio/text) • Cloud: HuggingFace BLIP</div>')
    return demo
if __name__ == "__main__":
    log("Starting BirdSense Pro...")
    models = check_ollama_models()
    log(f"Vision: {models['vision_model']}, Text: {models['text_model']}, HF: {bool(HF_TOKEN)}")
    app = create_app()
    app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)