sohiyiy committed on
Commit 0e96d6b · verified · 1 Parent(s): b2f9387

Upload folder using huggingface_hub

Files changed (1)
  1. app.py +406 -309
app.py CHANGED
@@ -16,11 +16,12 @@ import numpy as np
16
  import scipy.signal as signal
17
  from scipy.ndimage import gaussian_filter1d
18
  from dataclasses import dataclass
19
- from typing import Optional, Tuple, List, Dict
20
  import json
21
  import requests
22
  import re
23
  import urllib.parse
 
24
 
25
  # ================== CONFIG ==================
26
  SAMPLE_RATE = 48000
@@ -30,28 +31,118 @@ OLLAMA_URL = "http://localhost:11434"
30
  OLLAMA_MODELS = ["llama3.2", "phi4:latest", "qwen2.5:3b"] # Priority order
31
 
32
  # HuggingFace Inference API (for cloud deployment)
33
- # Token loaded from environment variable or HuggingFace Spaces secrets
34
- import os
35
  HF_API_TOKEN = os.environ.get("HF_TOKEN", "")
36
  HF_API_MODELS = [
37
  "mistralai/Mistral-7B-Instruct-v0.3",
38
  "google/flan-t5-xxl",
39
- "facebook/opt-1.3b", # Free, no auth needed
40
  ]
41
 
42
 
43
- # ================== DYNAMIC IMAGE SEARCH ==================
44
- """
45
- NO HARDCODED IMAGES!
46
- Fetches bird images dynamically from Wikipedia/Wikimedia Commons
47
  """
48
 
 
 
 
49
  def get_wikipedia_image(bird_name: str, scientific_name: str = "") -> str:
50
- """
51
- Dynamically fetch bird image from Wikipedia.
52
- No hardcoding - searches based on LLM output.
53
- """
54
- # Try scientific name first (more accurate)
55
  search_terms = []
56
  if scientific_name:
57
  search_terms.append(scientific_name.replace(" ", "_"))
@@ -60,7 +151,6 @@ def get_wikipedia_image(bird_name: str, scientific_name: str = "") -> str:
60
 
61
  for term in search_terms:
62
  try:
63
- # Wikipedia API to get page image
64
  wiki_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(term)}"
65
  resp = requests.get(wiki_url, timeout=5, headers={"User-Agent": "BirdSense/1.0"})
66
 
@@ -68,51 +158,40 @@ def get_wikipedia_image(bird_name: str, scientific_name: str = "") -> str:
68
  data = resp.json()
69
  if "thumbnail" in data and "source" in data["thumbnail"]:
70
  img_url = data["thumbnail"]["source"]
71
- # Get higher resolution
72
  img_url = img_url.replace("/220px-", "/400px-").replace("/320px-", "/400px-")
73
  return img_url
74
  elif "originalimage" in data and "source" in data["originalimage"]:
75
  return data["originalimage"]["source"]
76
- except Exception as e:
77
  continue
78
 
79
- # Fallback: Search Wikimedia Commons
80
  try:
81
- commons_url = f"https://commons.wikimedia.org/w/api.php"
82
  params = {
83
- "action": "query",
84
- "format": "json",
85
- "list": "search",
86
- "srsearch": f"{bird_name} bird",
87
- "srnamespace": "6", # File namespace
88
- "srlimit": "1"
89
  }
90
  resp = requests.get(commons_url, params=params, timeout=5)
91
  if resp.status_code == 200:
92
  data = resp.json()
93
  if data.get("query", {}).get("search"):
94
  file_title = data["query"]["search"][0]["title"]
95
- # Get actual image URL
96
- file_url = f"https://commons.wikimedia.org/w/api.php"
97
  file_params = {
98
- "action": "query",
99
- "format": "json",
100
- "titles": file_title,
101
- "prop": "imageinfo",
102
- "iiprop": "url",
103
- "iiurlwidth": "400"
104
  }
105
- file_resp = requests.get(file_url, params=file_params, timeout=5)
106
  if file_resp.status_code == 200:
107
- file_data = file_resp.json()
108
- pages = file_data.get("query", {}).get("pages", {})
109
  for page in pages.values():
110
  if "imageinfo" in page:
111
  return page["imageinfo"][0].get("thumburl", page["imageinfo"][0].get("url", ""))
112
  except:
113
  pass
114
 
115
- # Final fallback - generic bird silhouette
116
  return "https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/Bird_icon.svg/200px-Bird_icon.svg.png"
117
 
118
 
@@ -174,12 +253,7 @@ class SAMAudioProcessor:
174
  f, t, Zxx = signal.stft(audio, self.sr, nperseg=2048)
175
  magnitude = np.abs(Zxx)
176
 
177
- bands = [
178
- ("low_freq", 500, 2000),
179
- ("mid_freq", 2000, 5000),
180
- ("high_freq", 5000, 10000),
181
- ]
182
-
183
  detected = []
184
  for band_name, low, high in bands:
185
  band_idx = (f >= low) & (f <= high)
@@ -210,8 +284,40 @@ def get_available_ollama_model() -> Optional[str]:
210
 return None
211

212
213
  def call_ollama(prompt: str, system: str = None) -> Optional[str]:
214
- """Call local Ollama LLM with best available model."""
215
  model = get_available_ollama_model()
216
  if not model:
217
  return None
@@ -237,8 +343,6 @@ def call_ollama(prompt: str, system: str = None) -> Optional[str]:
237
  def call_hf_inference(prompt: str, system: str = None) -> Optional[str]:
238
  """Call HuggingFace Inference API."""
239
  full_prompt = f"{system}\n\n{prompt}" if system else prompt
240
-
241
- # Truncate prompt if too long
242
  if len(full_prompt) > 4000:
243
  full_prompt = full_prompt[:4000]
244
 
@@ -251,11 +355,7 @@ def call_hf_inference(prompt: str, system: str = None) -> Optional[str]:
251
 
252
  payload = {
253
  "inputs": full_prompt,
254
- "parameters": {
255
- "max_new_tokens": 1000,
256
- "temperature": 0.3,
257
- "return_full_text": False
258
- }
259
  }
260
 
261
  resp = requests.post(url, headers=headers, json=payload, timeout=90)
@@ -267,29 +367,17 @@ def call_hf_inference(prompt: str, system: str = None) -> Optional[str]:
267
  if text and len(text) > 20:
268
  return text
269
  elif isinstance(result, dict):
270
- text = result.get("generated_text", "")
271
- if text:
272
- return text
273
- elif resp.status_code == 503:
274
- continue # Model loading
275
- elif resp.status_code == 401:
276
- continue # Auth required, try next
277
-
278
  except Exception as e:
279
- print(f"HF API error for {model}: {e}")
280
  continue
281
-
282
  return None
283
 
284
 
285
  def call_llm(prompt: str, system: str = None) -> Optional[str]:
286
  """Call LLM - Ollama first, HuggingFace fallback."""
287
- # Try Ollama first
288
  result = call_ollama(prompt, system)
289
  if result:
290
  return result
291
-
292
- # Fallback to HuggingFace
293
  return call_hf_inference(prompt, system)
294
 
295
 
@@ -318,35 +406,28 @@ class AudioFeatures:
318
  sam_metadata: dict
319
 
320
  def to_prompt(self) -> str:
321
- freq_desc = "very low (<500Hz, large bird like crow/cuckoo)" if self.peak_frequency < 500 else \
322
  "low (500-1500Hz, koel/coucal)" if self.peak_frequency < 1500 else \
323
- "medium (1500-4000Hz, typical songbird)" if self.peak_frequency < 4000 else \
324
- "high (4000-7000Hz, warbler/sunbird)" if self.peak_frequency < 7000 else \
325
- "very high (>7000Hz, alarm call/small bird)"
326
 
327
- return f"""AUDIO ANALYSIS (after SAM-Audio bird call separation):
328
- - Duration: {self.duration:.2f} seconds
329
- - Peak frequency: {self.peak_frequency:.0f} Hz ({freq_desc})
330
- - Frequency range: {self.freq_range[0]:.0f} - {self.freq_range[1]:.0f} Hz
331
- - Spectral centroid: {self.spectral_centroid:.0f} Hz
332
- - Spectral bandwidth: {self.spectral_bandwidth:.0f} Hz
333
- - Call pattern: {"MELODIC (varying pitch)" if self.is_melodic else "MONOTONE (steady pitch)"}
334
- - Repetition: {"REPETITIVE ({:.1f} syllables/sec)".format(self.syllable_rate) if self.is_repetitive else "VARIABLE pattern"}
335
- - Syllable count: {self.num_syllables}
336
- - Signal quality: SNR {self.snr_db:.1f}dB {"(excellent)" if self.snr_db > 20 else "(good)" if self.snr_db > 10 else "(noisy)"}
337
-
338
- SAM-Audio separation: {self.sam_metadata.get('separation_ratio', 0)*100:.0f}% bird call isolated"""
339
 
340
 
341
  def extract_features(audio: np.ndarray, sr: int, sam_metadata: dict) -> AudioFeatures:
342
  """Extract comprehensive audio features."""
343
  duration = len(audio) / sr
344
-
345
- # Spectral analysis
346
  freqs, psd = signal.welch(audio, sr, nperseg=min(4096, len(audio)))
347
  peak_freq = freqs[np.argmax(psd)]
348
 
349
- # Spectral moments
350
  total_power = np.sum(psd) + 1e-10
351
  centroid = np.sum(freqs * psd) / total_power
352
  bandwidth = np.sqrt(np.sum(((freqs - centroid) ** 2) * psd) / total_power)
@@ -375,27 +456,22 @@ def extract_features(audio: np.ndarray, sr: int, sam_metadata: dict) -> AudioFea
375
  for c in chunks:
376
  if len(c) > 512:
377
  _, cpsd = signal.welch(c, sr, nperseg=min(1024, len(c)))
378
- chunk_freqs.append(freqs[np.argmax(cpsd)] if len(cpsd) == len(freqs) else peak_freq)
 
379
  if chunk_freqs:
380
  is_melodic = np.std(chunk_freqs) / (np.mean(chunk_freqs) + 1e-10) > 0.15
381
 
382
- # SNR
383
  noise = np.percentile(np.abs(audio), 5)
384
  sig = np.percentile(np.abs(audio), 95)
385
  snr = 20 * np.log10((sig + 1e-10) / (noise + 1e-10))
386
 
387
  return AudioFeatures(
388
- duration=duration,
389
- peak_frequency=float(peak_freq),
390
  freq_range=(float(freq_low), float(freq_high)),
391
- num_syllables=num_syl,
392
- syllable_rate=float(syl_rate),
393
- is_melodic=is_melodic,
394
- is_repetitive=syl_rate > 3,
395
- snr_db=float(snr),
396
- spectral_centroid=float(centroid),
397
- spectral_bandwidth=float(bandwidth),
398
- sam_metadata=sam_metadata
399
  )
400
 
401
 
@@ -420,154 +496,103 @@ def preprocess_audio(audio_data: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
420
  return audio_data, sr
421
 
422
 
423
- # ================== IMAGE ANALYSIS (YOLO-style) ==================
424
 
425
  def analyze_image_features(image: np.ndarray) -> Dict:
426
- """
427
- YOLO-inspired image feature extraction.
428
- Analyzes colors, patterns, shapes for bird identification.
429
- """
430
  if len(image.shape) != 3 or image.shape[2] < 3:
431
  return {"error": "Invalid image"}
432
 
433
  h, w = image.shape[:2]
434
-
435
- # Convert to different color spaces
436
  r, g, b = image[:,:,0], image[:,:,1], image[:,:,2]
437
 
438
- # Dominant colors analysis
439
  colors = []
440
  color_regions = []
441
 
442
- # Analyze different regions (head, body, tail approximation)
443
- regions = {
444
- "upper": image[:h//3, :, :],
445
- "middle": image[h//3:2*h//3, :, :],
446
- "lower": image[2*h//3:, :, :]
447
- }
448
 
449
  for region_name, region in regions.items():
450
  rr, rg, rb = np.mean(region[:,:,0]), np.mean(region[:,:,1]), np.mean(region[:,:,2])
451
-
452
- # Detect specific colors
453
  region_colors = []
454
- if rr > 180 and rg > 180 and rb > 180:
455
- region_colors.append("white")
456
- if rr < 60 and rg < 60 and rb < 60:
457
- region_colors.append("black")
458
- if rg > rr * 1.2 and rg > rb * 1.2:
459
- region_colors.append("green")
460
- if rb > rr * 1.2 and rb > rg * 1.1:
461
- region_colors.append("blue")
462
  if rr > rg * 1.3 and rr > rb * 1.3:
463
- if rr > 200:
464
- region_colors.append("red")
465
- else:
466
- region_colors.append("brown")
467
- if rr > 150 and rg > 100 and rb < 80:
468
- region_colors.append("yellow/orange")
469
  if abs(rr - rg) < 30 and abs(rg - rb) < 30:
470
- if rr > 150:
471
- region_colors.append("grey/white")
472
- else:
473
- region_colors.append("grey")
474
 
475
  if region_colors:
476
  color_regions.append(f"{region_name}: {', '.join(region_colors)}")
477
  colors.extend(region_colors)
478
 
479
- # Unique colors
480
- unique_colors = list(set(colors))
481
-
482
- # Pattern detection (simplified)
483
  gray = 0.299 * r + 0.587 * g + 0.114 * b
484
  edges = np.abs(np.gradient(gray, axis=0)) + np.abs(np.gradient(gray, axis=1))
485
  pattern_intensity = np.mean(edges)
486
 
487
- has_stripes = pattern_intensity > 20
488
- has_spots = False # Would need more sophisticated detection
489
-
490
- # Size estimation from aspect ratio
491
- aspect = w / h
492
- size_guess = "medium"
493
- if aspect > 1.5:
494
- size_guess = "long-tailed"
495
- elif aspect < 0.7:
496
- size_guess = "compact/round"
497
-
498
  return {
499
- "colors": unique_colors,
500
  "color_regions": color_regions,
501
- "has_patterns": has_stripes,
502
- "size_hint": size_guess,
503
- "pattern_intensity": pattern_intensity
504
  }
505
 
506
 
507
  # ================== LLM PROMPTS ==================
508
 
509
- SYSTEM_PROMPT = """You are an expert ornithologist specializing in bird identification. You have encyclopedic knowledge of 10,000+ bird species worldwide, with particular expertise in Indian birds (1,300+ species).
510
 
511
- CRITICAL RULES:
512
- 1. Identify birds based ONLY on the provided audio/image/description features
513
- 2. List ALL possible matching species with confidence scores
514
- 3. ALWAYS provide scientific names (they are REQUIRED for image lookup)
515
- 4. Be specific about WHY each bird matches the features
516
- 5. Consider geographic context (India-focused)
517
 
518
- Your response MUST be valid JSON in this exact format:
519
  {
520
  "birds": [
521
  {
522
- "name": "Common English Name",
523
  "scientific_name": "Genus species",
524
  "confidence": 85,
525
- "reasoning": "Detailed explanation of why this bird matches"
526
  }
527
  ],
528
- "analysis": "Overall analysis of the recording/image",
529
- "habitat_notes": "Relevant habitat information"
530
- }
531
-
532
- IMPORTANT: The scientific_name is REQUIRED and must be accurate - it's used to fetch the correct bird image."""
533
 
534
 
535
  def parse_llm_response(response: str) -> Tuple[List[Dict], str]:
536
  """Parse LLM JSON response."""
537
- birds = []
538
- analysis = ""
539
-
540
  if not response:
541
- return birds, "No response from LLM"
542
 
543
- # Try to extract JSON
544
  try:
545
- # Find JSON block
546
  json_match = re.search(r'\{[\s\S]*\}', response)
547
  if json_match:
548
  data = json.loads(json_match.group())
549
  birds = data.get("birds", [])
550
  analysis = data.get("analysis", "")
551
- except json.JSONDecodeError:
552
- # Try to parse structured text
553
  pass
554
-
555
  return birds, analysis
556
 
557
 
558
  def format_results(birds: List[Dict], analysis: str, extra_info: str = "") -> str:
559
- """Format results with DYNAMIC images (no hardcoding)."""
560
- output = "## 🐦 Birds Identified\n\n"
561
 
562
  if analysis:
563
  output += f"*{analysis}*\n\n"
564
-
565
  if extra_info:
566
- output += f"{extra_info}\n\n"
567
 
568
  if not birds:
569
- output += "### No birds identified. Please try with clearer audio/image.\n"
570
- return output
571
 
572
  for i, bird in enumerate(birds, 1):
573
  name = bird.get("name", "Unknown")
@@ -575,264 +600,336 @@ def format_results(birds: List[Dict], analysis: str, extra_info: str = "") -> st
575
  conf = bird.get("confidence", 0)
576
  reason = bird.get("reasoning", "")
577
 
578
- # DYNAMIC image fetch - NO HARDCODING
579
  img_url = get_wikipedia_image(name, scientific)
580
-
581
  badge = "🟒 HIGH" if conf >= 80 else "🟑 MEDIUM" if conf >= 60 else "πŸ”΄ LOW"
582
 
583
- output += f"""
584
- ---
585
 
586
- ### {i}. **{name}** ({conf}%) {badge}
587
 
588
  ![{name}]({img_url})
589
 
590
- **Scientific Name:** *{scientific}*
591
 
592
- **Why this bird:** {reason}
593
 
594
  """
595
-
596
  return output
597
 
598
 
599
- # ================== MAIN FUNCTIONS ==================
600
 
601
- def identify_audio(audio, location: str = "", month: str = ""):
602
- """Identify bird from audio."""
603
  if audio is None:
604
- return "### ⚠️ Please record or upload audio"
 
605
 
606
  status = get_llm_status()
 
607
 
608
  try:
609
  sr, audio_data = audio
610
  audio_data, sr = preprocess_audio(audio_data, sr)
611
 
612
- # SAM-Audio preprocessing
 
613
  bird_audio, sam_metadata = sam_audio.separate_bird_calls(audio_data)
614
  multi_sources = sam_audio.detect_multiple_birds(bird_audio)
615
 
616
- # Extract features
 
617
  features = extract_features(bird_audio, sr, sam_metadata)
618
 
619
- # Build prompt
620
- prompt = f"""Identify the bird(s) in this audio recording:
621
 
622
  {features.to_prompt()}
623
 
624
  """
625
  if location:
626
- prompt += f"LOCATION: {location}\n"
627
  if month:
628
- prompt += f"MONTH: {month}\n"
629
-
630
  if len(multi_sources) > 1:
631
- prompt += f"\nNOTE: Multiple frequency bands detected ({len(multi_sources)}) - likely multiple birds calling!\n"
632
 
633
- prompt += "\nIdentify ALL birds that match these audio characteristics. Provide scientific names."
634
 
635
- response = call_llm(prompt, SYSTEM_PROMPT)
636
- birds, analysis = parse_llm_response(response)
637
 
638
- extra_info = f"**πŸ”Š SAM-Audio:** {sam_metadata.get('separation_ratio', 0)*100:.0f}% separation | **LLM:** {status}"
 
 
 
 
 
 
 
 
639
 
640
  if birds:
641
- return format_results(birds, analysis, extra_info)
642
  else:
643
- return f"""### ⚠️ Could not identify bird
644
 
645
- **Audio Features Detected:**
646
  {features.to_prompt()}
647
 
648
- **LLM Response:** {response[:500] if response else 'No response'}
 
650
  **Status:** {status}
651
-
652
- Please ensure Ollama is running with llama3.2 or phi4 model."""
653
 
654
  except Exception as e:
655
- return f"### ❌ Error: {str(e)}\n\n**LLM:** {status}"
656
 
657
 
658
- def identify_image(image):
659
- """Identify bird from image using YOLO-style analysis + LLM."""
660
  if image is None:
661
- return "### ⚠️ Please upload an image"
 
662
 
663
  status = get_llm_status()
 
664
 
665
  try:
666
  img = np.array(image) if not isinstance(image, np.ndarray) else image
667
-
668
- # YOLO-style feature extraction
669
  features = analyze_image_features(img)
670
 
671
  if "error" in features:
672
- return f"### ⚠️ {features['error']}"
 
 
 
673
 
674
- # Build detailed prompt
675
- prompt = f"""Identify the bird in this image based on visual analysis:
676
 
677
- IMAGE ANALYSIS (YOLO-style feature extraction):
678
- - Detected colors: {', '.join(features['colors']) if features['colors'] else 'mixed/unclear'}
679
- - Color distribution by region:
680
- {chr(10).join(' - ' + r for r in features['color_regions'])}
681
- - Pattern detected: {'Yes (striped/patterned)' if features['has_patterns'] else 'No distinct patterns'}
682
- - Body shape: {features['size_hint']}
683
 
684
- Based on these visual features, identify ALL possible Indian bird species that match.
685
- Consider color patterns, size, and shape carefully.
686
- IMPORTANT: Provide accurate scientific names for each bird."""
687
 
688
- response = call_llm(prompt, SYSTEM_PROMPT)
689
- birds, analysis = parse_llm_response(response)
 
 
690
 
691
- extra_info = f"**πŸ“· Visual Analysis:** {', '.join(features['colors'])} | **LLM:** {status}"
 
692
 
693
  if birds:
694
- return format_results(birds, analysis, extra_info)
695
  else:
696
- return f"""### ⚠️ Could not identify bird from image
697
-
698
- **Detected Colors:** {', '.join(features['colors'])}
699
- **Color Regions:** {'; '.join(features['color_regions'])}
700
-
701
- **LLM Response:** {response[:500] if response else 'No response'}
702
-
703
- **Status:** {status}"""
704
 
705
  except Exception as e:
706
- return f"### ❌ Error: {str(e)}\n\n**LLM:** {status}"
707
 
708
 
709
- def identify_description(description: str):
710
- """Identify bird from description."""
711
  if not description or len(description.strip()) < 5:
712
- return "### ⚠️ Please enter a description"
 
713
 
714
  status = get_llm_status()
 
715
 
716
  prompt = f"""Identify the bird(s) from this description:
717
 
718
- USER DESCRIPTION:
719
- {description}
720
 
721
- Focus on Indian birds. Match the description to specific species.
722
- IMPORTANT: Provide accurate scientific names for image lookup."""
723
 
724
- response = call_llm(prompt, SYSTEM_PROMPT)
725
- birds, analysis = parse_llm_response(response)
 
 
726
 
727
- extra_info = f"**πŸ“ Description Match** | **LLM:** {status}"
728
 
729
  if birds:
730
- return format_results(birds, analysis, extra_info)
731
  else:
732
- return f"""### ⚠️ Could not identify bird
733
-
734
- **LLM Response:** {response[:500] if response else 'No response'}
735
-
736
- **Status:** {status}"""
737
 
738
 
739
  # ================== GRADIO UI ==================
740
 
741
- with gr.Blocks(title="🐦 BirdSense Pro", theme=gr.themes.Soft()) as demo:
 
 
 
742
 
 
743
  gr.HTML("""
744
- <div style="text-align: center; background: linear-gradient(135deg, #1a4d2e 0%, #2d5a3e 50%, #1a4d2e 100%); padding: 2rem; border-radius: 16px; margin-bottom: 1rem;">
745
- <h1 style="color: #4ade80; font-size: 2.5rem; margin: 0;">🐦 BirdSense Pro</h1>
746
- <p style="color: #94a3b8; font-size: 1.1rem;">META SAM-Audio + Llama3.2/Phi4 LLM</p>
747
- <p style="color: #64748b; font-size: 0.9rem;">Dynamic Wikipedia Images β€’ No Hardcoding β€’ 10,000+ Species</p>
748
  </div>
749
  """)
750
 
751
- gr.Markdown(f"### Current LLM: {get_llm_status()}")
 
 
 
 
 
752
 
 
753
  with gr.Tabs():
 
 
754
  with gr.Tab("🎀 Audio Identification"):
755
  gr.Markdown("""
756
- **How it works:**
757
- 1. SAM-Audio separates bird calls from background noise
758
- 2. Features (frequency, syllables, pattern) are extracted
759
- 3. LLM identifies matching species from 10,000+ birds
760
- 4. Images are fetched dynamically from Wikipedia
761
  """)
762
 
763
  with gr.Row():
764
  with gr.Column(scale=1):
765
- audio_in = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Record or Upload")
766
  with gr.Row():
767
- loc = gr.Textbox(label="πŸ“ Location", placeholder="Western Ghats, Mumbai...")
768
- month = gr.Dropdown(label="πŸ“… Month", choices=[""] + [
769
- "January", "February", "March", "April", "May", "June",
770
- "July", "August", "September", "October", "November", "December"
771
- ])
772
- audio_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
 
 
 
 
 
 
 
 
 
773
 
774
  with gr.Column(scale=2):
775
- audio_out = gr.Markdown()
 
 
 
776
 
777
- audio_btn.click(identify_audio, [audio_in, loc, month], audio_out)
778
-
 
 
 
 
 
 
779
  with gr.Tab("πŸ“· Image Identification"):
780
  gr.Markdown("""
781
- **YOLO-style visual analysis:**
782
- - Extracts colors from different regions (head, body, tail)
783
- - Detects patterns and shapes
784
- - LLM matches to bird species
785
  """)
786
 
787
  with gr.Row():
788
  with gr.Column(scale=1):
789
- img_in = gr.Image(sources=["upload", "webcam"], type="numpy", label="📷 Upload or Capture")
790
- img_btn = gr.Button("🔍 Identify Bird", variant="primary", size="lg")
791
  with gr.Column(scale=2):
792
- img_out = gr.Markdown()
 
 
 
793
 
794
- img_btn.click(identify_image, [img_in], img_out)
795
 
 
796
  with gr.Tab("πŸ“ Description"):
 
 
 
 
 
797
  with gr.Row():
798
  with gr.Column(scale=1):
799
- desc_in = gr.Textbox(
800
- label="Describe the bird",
801
- lines=4,
802
- placeholder="Example: Small green bird with red forehead, making repetitive 'tuk-tuk' sound, seen in garden"
803
  )
804
- desc_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
 
 
 
 
 
 
805
  with gr.Column(scale=2):
806
- desc_out = gr.Markdown()
 
 
 
807
 
808
- desc_btn.click(identify_description, [desc_in], desc_out)
809
 
 
810
  with gr.Tab("ℹ️ About"):
811
  gr.Markdown("""
812
- ## 🐦 BirdSense Pro - Technical Details
813
-
814
- ### No Hardcoding Policy
815
- - **Images**: Dynamically fetched from Wikipedia based on LLM-provided scientific names
816
- - **Species**: LLM has knowledge of 10,000+ bird species, not limited to a fixed list
817
- - **Identification**: Pure AI reasoning, no lookup tables
818
-
819
- ### Models Used
820
- - **Audio**: META SAM-Audio style preprocessing (500-10000 Hz bird call isolation)
821
- - **LLM**: Llama3.2 / Phi4 (local) or Mistral-7B (cloud)
822
- - **Image**: YOLO-inspired color/pattern extraction β†’ LLM reasoning
823
-
824
- ### API Endpoints
825
- - **Local**: Ollama at localhost:11434
826
- - **Cloud**: HuggingFace Inference API (fallback)
827
 
828
  ### CSCR Initiative
829
  Open-source bird identification for researchers in India.
830
  """)
831
 
 
832
  gr.HTML("""
833
- <div style="text-align: center; padding: 1rem; margin-top: 1rem; border-top: 1px solid #334155;">
834
- <p style="color: #4ade80;">🐦 BirdSense Pro - CSCR Initiative</p>
835
- <p style="color: #64748b; font-size: 0.8rem;">Dynamic Images β€’ No Hardcoding β€’ LLM-Powered</p>
836
  </div>
837
  """)
838
 
@@ -840,5 +937,5 @@ Open-source bird identification for researchers in India.
840
  if __name__ == "__main__":
841
  print(f"\n🐦 BirdSense Pro")
842
  print(f"LLM: {get_llm_status()}")
843
- print("Starting server...")
844
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
16
  import scipy.signal as signal
17
  from scipy.ndimage import gaussian_filter1d
18
  from dataclasses import dataclass
19
+ from typing import Optional, Tuple, List, Dict, Generator
20
  import json
21
  import requests
22
  import re
23
  import urllib.parse
24
+ import os
25
 
26
  # ================== CONFIG ==================
27
  SAMPLE_RATE = 48000
 
31
  OLLAMA_MODELS = ["llama3.2", "phi4:latest", "qwen2.5:3b"] # Priority order
32
 
33
  # HuggingFace Inference API (for cloud deployment)
 
 
34
  HF_API_TOKEN = os.environ.get("HF_TOKEN", "")
35
  HF_API_MODELS = [
36
  "mistralai/Mistral-7B-Instruct-v0.3",
37
  "google/flan-t5-xxl",
38
+ "facebook/opt-1.3b",
39
  ]
40
 
41
 
42
+ # ================== CUSTOM CSS ==================
43
+ CUSTOM_CSS = """
44
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
45
+
46
+ * {
47
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
48
+ }
49
+
50
+ .gradio-container {
51
+ max-width: 1400px !important;
52
+ margin: 0 auto !important;
53
+ padding: 20px !important;
54
+ }
55
+
56
+ h1, h2, h3, h4, h5, h6 {
57
+ font-family: 'Inter', sans-serif !important;
58
+ font-weight: 600 !important;
59
+ letter-spacing: -0.02em !important;
60
+ }
61
+
62
+ .header-banner {
63
+ background: linear-gradient(135deg, #0f2e1f 0%, #1a4d2e 50%, #0f2e1f 100%) !important;
64
+ padding: 2.5rem !important;
65
+ border-radius: 20px !important;
66
+ margin-bottom: 1.5rem !important;
67
+ box-shadow: 0 10px 40px rgba(0,0,0,0.3) !important;
68
+ }
69
+
70
+ .header-banner h1 {
71
+ color: #4ade80 !important;
72
+ font-size: 3rem !important;
73
+ font-weight: 700 !important;
74
+ margin: 0 0 0.5rem 0 !important;
75
+ text-shadow: 0 2px 10px rgba(74, 222, 128, 0.3) !important;
76
+ }
77
+
78
+ .header-banner p {
79
+ margin: 0.3rem 0 !important;
80
+ line-height: 1.5 !important;
81
+ }
82
+
83
+ .llm-status {
84
+ background: #1e293b !important;
85
+ padding: 12px 20px !important;
86
+ border-radius: 12px !important;
87
+ margin-bottom: 1rem !important;
88
+ font-weight: 500 !important;
89
+ font-size: 1rem !important;
90
+ }
91
+
92
+ .tab-nav button {
93
+ font-size: 1rem !important;
94
+ font-weight: 500 !important;
95
+ padding: 12px 24px !important;
96
+ }
97
+
98
+ .primary-btn {
99
+ background: linear-gradient(135deg, #4ade80 0%, #22c55e 100%) !important;
100
+ color: #0f172a !important;
101
+ font-weight: 600 !important;
102
+ font-size: 1.1rem !important;
103
+ padding: 16px 32px !important;
104
+ border-radius: 12px !important;
105
+ border: none !important;
106
+ cursor: pointer !important;
107
+ transition: all 0.2s ease !important;
108
+ box-shadow: 0 4px 15px rgba(74, 222, 128, 0.3) !important;
109
+ }
110
+
111
+ .primary-btn:hover {
112
+ transform: translateY(-2px) !important;
113
+ box-shadow: 0 6px 20px rgba(74, 222, 128, 0.4) !important;
114
+ }
115
+
116
+ .result-box {
117
+ background: #1e293b !important;
118
+ border-radius: 16px !important;
119
+ padding: 24px !important;
120
+ min-height: 400px !important;
121
+ }
122
+
123
+ textarea, input[type="text"] {
124
+ font-family: 'Inter', sans-serif !important;
125
+ font-size: 1rem !important;
126
+ border-radius: 10px !important;
127
+ }
128
+
129
+ code, pre {
130
+ font-family: 'JetBrains Mono', monospace !important;
131
+ }
132
+
133
+ .footer {
134
+ text-align: center !important;
135
+ padding: 1.5rem !important;
136
+ margin-top: 2rem !important;
137
+ border-top: 1px solid #334155 !important;
138
+ }
139
  """
140
 
141
+
142
+ # ================== DYNAMIC IMAGE SEARCH ==================
143
+
144
  def get_wikipedia_image(bird_name: str, scientific_name: str = "") -> str:
145
+ """Dynamically fetch bird image from Wikipedia. No hardcoding."""
 
 
 
 
146
  search_terms = []
147
  if scientific_name:
148
  search_terms.append(scientific_name.replace(" ", "_"))
 
151
 
152
  for term in search_terms:
153
  try:
 
154
  wiki_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(term)}"
155
  resp = requests.get(wiki_url, timeout=5, headers={"User-Agent": "BirdSense/1.0"})
156
 
 
158
  data = resp.json()
159
  if "thumbnail" in data and "source" in data["thumbnail"]:
160
  img_url = data["thumbnail"]["source"]
 
161
  img_url = img_url.replace("/220px-", "/400px-").replace("/320px-", "/400px-")
162
  return img_url
163
  elif "originalimage" in data and "source" in data["originalimage"]:
164
  return data["originalimage"]["source"]
165
+ except:
166
  continue
167
 
168
+ # Fallback: Wikimedia Commons search
169
  try:
170
+ commons_url = "https://commons.wikimedia.org/w/api.php"
171
  params = {
172
+ "action": "query", "format": "json",
173
+ "list": "search", "srsearch": f"{bird_name} bird",
174
+ "srnamespace": "6", "srlimit": "1"
 
 
 
175
  }
176
  resp = requests.get(commons_url, params=params, timeout=5)
177
  if resp.status_code == 200:
178
  data = resp.json()
179
  if data.get("query", {}).get("search"):
180
  file_title = data["query"]["search"][0]["title"]
 
 
181
  file_params = {
182
+ "action": "query", "format": "json",
183
+ "titles": file_title, "prop": "imageinfo",
184
+ "iiprop": "url", "iiurlwidth": "400"
 
 
 
185
  }
186
+ file_resp = requests.get(commons_url, params=file_params, timeout=5)
187
  if file_resp.status_code == 200:
188
+ pages = file_resp.json().get("query", {}).get("pages", {})
 
189
  for page in pages.values():
190
  if "imageinfo" in page:
191
  return page["imageinfo"][0].get("thumburl", page["imageinfo"][0].get("url", ""))
192
  except:
193
  pass
194
 
 
195
  return "https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/Bird_icon.svg/200px-Bird_icon.svg.png"
196
 
197
 
 
253
  f, t, Zxx = signal.stft(audio, self.sr, nperseg=2048)
254
  magnitude = np.abs(Zxx)
255
 
256
+ bands = [("low", 500, 2000), ("mid", 2000, 5000), ("high", 5000, 10000)]
 
 
 
 
 
257
  detected = []
258
  for band_name, low, high in bands:
259
  band_idx = (f >= low) & (f <= high)
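The band scan above reduces the STFT magnitude to per-band energy. The same idea can be checked on a synthetic tone, where only the band containing the tone lights up; a minimal sketch reusing this file's band edges (the test signal is arbitrary):

```python
import numpy as np
import scipy.signal as signal

sr = 48000
t = np.linspace(0, 1.0, sr, endpoint=False)
audio = np.sin(2 * np.pi * 3000 * t)  # pure 3 kHz tone -> should land in "mid"

f, _, Zxx = signal.stft(audio, sr, nperseg=2048)
magnitude = np.abs(Zxx)

for band_name, low, high in [("low", 500, 2000), ("mid", 2000, 5000), ("high", 5000, 10000)]:
    band_idx = (f >= low) & (f <= high)
    energy = float(magnitude[band_idx].mean())  # rows of Zxx are frequency bins
    print(band_name, round(energy, 4))          # "mid" dominates for a 3 kHz tone
```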
 
284
  return None
285
 
286
 
287
+ def call_ollama_stream(prompt: str, system: str = None) -> Generator[str, None, None]:
288
+ """Call local Ollama LLM with streaming."""
289
+ model = get_available_ollama_model()
290
+ if not model:
291
+ yield "⚠️ Ollama not available"
292
+ return
293
+
294
+ payload = {
295
+ "model": model,
296
+ "prompt": prompt,
297
+ "stream": True,
298
+ "options": {"temperature": 0.2, "num_predict": 2000}
299
+ }
300
+ if system:
301
+ payload["system"] = system
302
+
303
+ try:
304
+ with requests.post(f"{OLLAMA_URL}/api/generate", json=payload, stream=True, timeout=180) as r:
305
+ full_response = ""
306
+ for line in r.iter_lines():
307
+ if line:
308
+ try:
309
+ data = json.loads(line)
310
+ chunk = data.get("response", "")
311
+ full_response += chunk
312
+ yield full_response
313
+ except:
314
+ continue
315
+ except Exception as e:
316
+ yield f"Error: {e}"
317
+
318
+
319
  def call_ollama(prompt: str, system: str = None) -> Optional[str]:
320
+ """Call local Ollama LLM (non-streaming)."""
321
  model = get_available_ollama_model()
322
  if not model:
323
  return None
 
343
  def call_hf_inference(prompt: str, system: str = None) -> Optional[str]:
344
  """Call HuggingFace Inference API."""
345
  full_prompt = f"{system}\n\n{prompt}" if system else prompt
 
 
346
  if len(full_prompt) > 4000:
347
  full_prompt = full_prompt[:4000]
348
 
 
355
 
356
  payload = {
357
  "inputs": full_prompt,
358
+ "parameters": {"max_new_tokens": 1000, "temperature": 0.3, "return_full_text": False}
 
 
 
 
359
  }
360
 
361
  resp = requests.post(url, headers=headers, json=payload, timeout=90)
 
367
  if text and len(text) > 20:
368
  return text
369
  elif isinstance(result, dict):
370
+ return result.get("generated_text", "")
 
 
 
 
 
 
 
371
  except Exception as e:
 
372
  continue
 
373
  return None
374
 
375
 
376
  def call_llm(prompt: str, system: str = None) -> Optional[str]:
377
  """Call LLM - Ollama first, HuggingFace fallback."""
 
378
  result = call_ollama(prompt, system)
379
  if result:
380
  return result
 
 
381
  return call_hf_inference(prompt, system)
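The cloud fallback above posts to the classic HuggingFace Inference API, where text-generation models typically answer with `[{"generated_text": ...}]` and a 503 while the model is still loading. A minimal sketch of the request shape (token and model are placeholders):

```python
import os

import requests

model = "mistralai/Mistral-7B-Instruct-v0.3"  # placeholder; any hosted text-gen model
url = f"https://api-inference.huggingface.co/models/{model}"
headers = {"Authorization": f"Bearer {os.environ.get('HF_TOKEN', '')}"}
payload = {
    "inputs": "Identify a small green bird with a red forehead.",
    "parameters": {"max_new_tokens": 200, "temperature": 0.3, "return_full_text": False},
}

resp = requests.post(url, headers=headers, json=payload, timeout=90)
if resp.status_code == 200:
    result = resp.json()
    print(result[0]["generated_text"] if isinstance(result, list) else result)
elif resp.status_code == 503:
    print("Model is still loading; retry after a short delay.")
```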
382
 
383
 
 
406
  sam_metadata: dict
407
 
408
  def to_prompt(self) -> str:
409
+ freq_desc = "very low (<500Hz, large bird)" if self.peak_frequency < 500 else \
410
  "low (500-1500Hz, koel/coucal)" if self.peak_frequency < 1500 else \
411
+ "medium (1500-4000Hz, songbird)" if self.peak_frequency < 4000 else \
412
+ "high (4000-7000Hz, warbler)" if self.peak_frequency < 7000 else \
413
+ "very high (>7000Hz, alarm call)"
414
 
415
+ return f"""AUDIO ANALYSIS (SAM-Audio processed):
416
+ β€’ Duration: {self.duration:.2f}s
417
+ β€’ Peak frequency: {self.peak_frequency:.0f} Hz ({freq_desc})
418
+ β€’ Frequency range: {self.freq_range[0]:.0f} - {self.freq_range[1]:.0f} Hz
419
+ β€’ Pattern: {"MELODIC" if self.is_melodic else "MONOTONE"}, {"REPETITIVE ({:.1f}/sec)".format(self.syllable_rate) if self.is_repetitive else "VARIABLE"}
420
+ β€’ Syllables: {self.num_syllables}
421
+ β€’ Quality: SNR {self.snr_db:.0f}dB
422
+ β€’ SAM separation: {self.sam_metadata.get('separation_ratio', 0)*100:.0f}%"""
 
 
 
 
423
 
424
 
425
  def extract_features(audio: np.ndarray, sr: int, sam_metadata: dict) -> AudioFeatures:
426
  """Extract comprehensive audio features."""
427
  duration = len(audio) / sr
 
 
428
  freqs, psd = signal.welch(audio, sr, nperseg=min(4096, len(audio)))
429
  peak_freq = freqs[np.argmax(psd)]
430
 
 
431
  total_power = np.sum(psd) + 1e-10
432
  centroid = np.sum(freqs * psd) / total_power
433
  bandwidth = np.sqrt(np.sum(((freqs - centroid) ** 2) * psd) / total_power)
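The two spectral moments computed here are the power-weighted mean frequency (centroid) and the power-weighted standard deviation around it (bandwidth). A self-contained check on a two-tone signal (values chosen only for illustration):

```python
import numpy as np
import scipy.signal as signal

sr = 48000
t = np.linspace(0, 1.0, sr, endpoint=False)
audio = np.sin(2 * np.pi * 2000 * t) + 0.5 * np.sin(2 * np.pi * 6000 * t)

freqs, psd = signal.welch(audio, sr, nperseg=4096)
total_power = np.sum(psd) + 1e-10
centroid = np.sum(freqs * psd) / total_power  # power-weighted mean frequency
bandwidth = np.sqrt(np.sum(((freqs - centroid) ** 2) * psd) / total_power)
print(f"centroid ~{centroid:.0f} Hz, bandwidth ~{bandwidth:.0f} Hz")
# The centroid sits between 2 kHz and 6 kHz, pulled toward the stronger tone.
```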
 
456
  for c in chunks:
457
  if len(c) > 512:
458
  _, cpsd = signal.welch(c, sr, nperseg=min(1024, len(c)))
459
+ if len(cpsd) == len(freqs):
460
+ chunk_freqs.append(freqs[np.argmax(cpsd)])
461
  if chunk_freqs:
462
  is_melodic = np.std(chunk_freqs) / (np.mean(chunk_freqs) + 1e-10) > 0.15
463
 
 
464
  noise = np.percentile(np.abs(audio), 5)
465
  sig = np.percentile(np.abs(audio), 95)
466
  snr = 20 * np.log10((sig + 1e-10) / (noise + 1e-10))
467
 
468
  return AudioFeatures(
469
+ duration=duration, peak_frequency=float(peak_freq),
 
470
  freq_range=(float(freq_low), float(freq_high)),
471
+ num_syllables=num_syl, syllable_rate=float(syl_rate),
472
+ is_melodic=is_melodic, is_repetitive=syl_rate > 3,
473
+ snr_db=float(snr), spectral_centroid=float(centroid),
474
+ spectral_bandwidth=float(bandwidth), sam_metadata=sam_metadata
 
 
 
 
475
  )
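The SNR estimate above treats the 5th percentile of the absolute amplitude as the noise floor and the 95th as the signal level, i.e. 20·log10(p95/p5). A quick numeric check on a synthetic clip where roughly a fifth of the samples are a loud tone burst:

```python
import numpy as np

rng = np.random.default_rng(0)
audio = 0.01 * rng.standard_normal(48000)  # low-level noise floor
audio[10000:20000] += np.sin(2 * np.pi * 3000 * np.arange(10000) / 48000)

p5 = np.percentile(np.abs(audio), 5)    # proxy for the noise floor
p95 = np.percentile(np.abs(audio), 95)  # proxy for the signal level
print(f"SNR ~{20 * np.log10((p95 + 1e-10) / (p5 + 1e-10)):.1f} dB")  # large, ~60 dB
```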
476
 
477
 
 
496
  return audio_data, sr
497
 
498
 
499
+ # ================== IMAGE ANALYSIS ==================
500
 
501
  def analyze_image_features(image: np.ndarray) -> Dict:
502
+ """YOLO-inspired image feature extraction."""
 
 
 
503
  if len(image.shape) != 3 or image.shape[2] < 3:
504
  return {"error": "Invalid image"}
505
 
506
  h, w = image.shape[:2]
 
 
507
  r, g, b = image[:,:,0], image[:,:,1], image[:,:,2]
508
 
 
509
  colors = []
510
  color_regions = []
511
 
512
+ regions = {"upper": image[:h//3, :, :], "middle": image[h//3:2*h//3, :, :], "lower": image[2*h//3:, :, :]}
 
 
 
 
 
513
 
514
  for region_name, region in regions.items():
515
  rr, rg, rb = np.mean(region[:,:,0]), np.mean(region[:,:,1]), np.mean(region[:,:,2])
 
 
516
  region_colors = []
517
+
518
+ if rr > 180 and rg > 180 and rb > 180: region_colors.append("white")
519
+ if rr < 60 and rg < 60 and rb < 60: region_colors.append("black")
520
+ if rg > rr * 1.2 and rg > rb * 1.2: region_colors.append("green")
521
+ if rb > rr * 1.2 and rb > rg * 1.1: region_colors.append("blue")
522
  if rr > rg * 1.3 and rr > rb * 1.3:
523
+ region_colors.append("red" if rr > 200 else "brown")
524
+ if rr > 150 and rg > 100 and rb < 80: region_colors.append("yellow/orange")
525
  if abs(rr - rg) < 30 and abs(rg - rb) < 30:
526
+ region_colors.append("grey/white" if rr > 150 else "grey")
 
 
 
527
 
528
  if region_colors:
529
  color_regions.append(f"{region_name}: {', '.join(region_colors)}")
530
 colors.extend(region_colors)
531
532
  gray = 0.299 * r + 0.587 * g + 0.114 * b
533
  edges = np.abs(np.gradient(gray, axis=0)) + np.abs(np.gradient(gray, axis=1))
534
 pattern_intensity = np.mean(edges)
535
536
  return {
537
+ "colors": list(set(colors)),
538
  "color_regions": color_regions,
539
+ "has_patterns": pattern_intensity > 20,
540
+ "size_hint": "long-tailed" if w/h > 1.5 else "compact" if w/h < 0.7 else "medium"
 
541
  }
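A quick way to sanity-check these thresholds is a synthetic image with known region colors. A sketch assuming `analyze_image_features` from this file is in scope; a green upper third over a brown body should be reported per region:

```python
import numpy as np

img = np.zeros((90, 60, 3), dtype=np.float64)
img[:30] = (40, 180, 40)   # upper third: rg dominates -> "green"
img[30:] = (150, 90, 60)   # rest: rr > 1.3*rg and rr > 1.3*rb, rr <= 200 -> "brown"

features = analyze_image_features(img)
print(features["colors"])         # e.g. ['green', 'brown'] (set order may vary)
print(features["color_regions"])  # per-region breakdown: upper/middle/lower
print(features["size_hint"])      # 60/90 < 0.7 -> "compact"
```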
542
 
543
 
544
  # ================== LLM PROMPTS ==================
545
 
546
+ SYSTEM_PROMPT = """You are an expert ornithologist with encyclopedic knowledge of 10,000+ bird species worldwide, specializing in Indian birds (1,300+ species).
547
 
548
+ CRITICAL REQUIREMENTS:
549
+ 1. Identify birds based ONLY on the provided features
550
+ 2. ALWAYS include scientific names (REQUIRED for image lookup)
551
+ 3. Provide confidence scores (0-100)
552
+ 4. Explain your reasoning
 
553
 
554
+ RESPOND IN VALID JSON:
555
  {
556
  "birds": [
557
  {
558
+ "name": "Common Name",
559
  "scientific_name": "Genus species",
560
  "confidence": 85,
561
+ "reasoning": "Why this bird matches"
562
  }
563
  ],
564
+ "analysis": "Brief overall analysis"
565
+ }"""
 
 
 
566
 
567
 
568
  def parse_llm_response(response: str) -> Tuple[List[Dict], str]:
569
  """Parse LLM JSON response."""
570
+ birds, analysis = [], ""
 
 
571
  if not response:
572
+ return birds, "No response"
573
 
 
574
  try:
 
575
  json_match = re.search(r'\{[\s\S]*\}', response)
576
  if json_match:
577
  data = json.loads(json_match.group())
578
  birds = data.get("birds", [])
579
  analysis = data.get("analysis", "")
580
+ except:
 
581
  pass
 
582
  return birds, analysis
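The greedy `\{[\s\S]*\}` match is what lets the parser tolerate prose before and after the JSON object. A worked example with a chatty reply (the reply text is invented for illustration):

```python
import json
import re

reply = """Sure! Here is my identification:
{
  "birds": [{"name": "Asian Koel", "scientific_name": "Eudynamys scolopaceus",
             "confidence": 82, "reasoning": "Low-frequency repetitive call"}],
  "analysis": "Single melodic caller in the 500-1500 Hz band"
}
Hope that helps!"""

match = re.search(r'\{[\s\S]*\}', reply)  # first "{" through the last "}"
data = json.loads(match.group())
print(data["birds"][0]["scientific_name"])  # Eudynamys scolopaceus
print(data["analysis"])
```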
583
 
584
 
585
  def format_results(birds: List[Dict], analysis: str, extra_info: str = "") -> str:
586
+ """Format results with dynamic images."""
587
+ output = "## 🐦 **Birds Identified**\n\n"
588
 
589
  if analysis:
590
  output += f"*{analysis}*\n\n"
 
591
  if extra_info:
592
+ output += f"πŸ“Š {extra_info}\n\n"
593
 
594
  if not birds:
595
+ return output + "⚠️ **No birds identified.** Please try with clearer audio/image.\n"
 
596
 
597
  for i, bird in enumerate(birds, 1):
598
  name = bird.get("name", "Unknown")
 
600
  conf = bird.get("confidence", 0)
601
  reason = bird.get("reasoning", "")
602
 
 
603
  img_url = get_wikipedia_image(name, scientific)
 
604
  badge = "🟒 HIGH" if conf >= 80 else "🟑 MEDIUM" if conf >= 60 else "πŸ”΄ LOW"
605
 
606
+ output += f"""---
 
607
 
608
+ ### {i}. **{name}** — {conf}% {badge}
609
 
610
  ![{name}]({img_url})
611
 
612
+ **Scientific:** *{scientific}*
613
 
614
+ **Reasoning:** {reason}
615
 
616
  """
 
617
  return output
618
 
619
 
620
+ # ================== MAIN FUNCTIONS (WITH STREAMING) ==================
621
 
622
+ def identify_audio_stream(audio, location: str = "", month: str = "") -> Generator[str, None, None]:
623
+ """Identify bird from audio with streaming."""
624
  if audio is None:
625
+ yield "## ⚠️ Please record or upload audio first"
626
+ return
627
 
628
  status = get_llm_status()
629
+ yield f"## πŸ”„ Processing audio...\n\n**LLM:** {status}"
630
 
631
  try:
632
  sr, audio_data = audio
633
  audio_data, sr = preprocess_audio(audio_data, sr)
634
 
635
+ yield f"## πŸ”„ Applying SAM-Audio preprocessing...\n\n**LLM:** {status}"
636
+
637
  bird_audio, sam_metadata = sam_audio.separate_bird_calls(audio_data)
638
  multi_sources = sam_audio.detect_multiple_birds(bird_audio)
639
 
640
+ yield f"## πŸ”„ Extracting features...\n\nSAM separation: {sam_metadata['separation_ratio']*100:.0f}%\n\n**LLM:** {status}"
641
+
642
  features = extract_features(bird_audio, sr, sam_metadata)
643
 
644
+ prompt = f"""Identify the bird(s) in this recording:
 
645
 
646
  {features.to_prompt()}
647
 
648
  """
649
  if location:
650
+ prompt += f"Location: {location}\n"
651
  if month:
652
+ prompt += f"Month: {month}\n"
 
653
  if len(multi_sources) > 1:
654
+ prompt += f"\nMultiple frequency bands active ({len(multi_sources)}) - possibly multiple birds!\n"
655
 
656
+ prompt += "\nProvide scientific names for all matches."
657
 
658
+ yield f"## πŸ”„ Consulting {status}...\n\n{features.to_prompt()}"
 
659
 
660
+ # Stream the response
661
+ full_response = ""
662
+ for chunk in call_ollama_stream(prompt, SYSTEM_PROMPT):
663
+ full_response = chunk
664
+ yield f"## πŸ”„ LLM thinking...\n\n```\n{chunk[:500]}...\n```"
665
+
666
+ # Parse and format final result
667
+ birds, analysis = parse_llm_response(full_response)
668
+ extra_info = f"**SAM-Audio:** {sam_metadata['separation_ratio']*100:.0f}% | **LLM:** {status}"
669
 
670
  if birds:
671
+ yield format_results(birds, analysis, extra_info)
672
  else:
673
+ yield f"""## ⚠️ Could not identify bird
674
 
675
+ **Audio Features:**
676
  {features.to_prompt()}
677
 
678
+ **LLM Response:**
679
+ ```
680
+ {full_response[:800] if full_response else 'No response'}
681
+ ```
682
 
683
  **Status:** {status}
684
+ """
 
685
 
686
  except Exception as e:
687
+ yield f"## ❌ Error: {str(e)}\n\n**LLM:** {status}"
688
 
689
 
690
+ def identify_image_stream(image) -> Generator[str, None, None]:
691
+ """Identify bird from image with streaming."""
692
  if image is None:
693
+ yield "## ⚠️ Please upload an image first"
694
+ return
695
 
696
  status = get_llm_status()
697
+ yield f"## πŸ”„ Analyzing image...\n\n**LLM:** {status}"
698
 
699
  try:
700
  img = np.array(image) if not isinstance(image, np.ndarray) else image
 
 
701
  features = analyze_image_features(img)
702
 
703
  if "error" in features:
704
+ yield f"## ⚠️ {features['error']}"
705
+ return
706
+
707
+ yield f"## πŸ”„ Colors detected: {', '.join(features['colors'])}\n\n**LLM:** {status}"
708
 
709
+ prompt = f"""Identify the bird based on visual analysis:
 
710
 
711
+ DETECTED FEATURES:
712
+ • Colors: {', '.join(features['colors']) if features['colors'] else 'unclear'}
713
+ • Regions: {'; '.join(features['color_regions'])}
714
+ • Patterns: {'Yes' if features['has_patterns'] else 'No'}
715
+ • Shape: {features['size_hint']}
 
716
 
717
+ Identify Indian bird species that match. Provide scientific names."""
 
 
718
 
719
+ full_response = ""
720
+ for chunk in call_ollama_stream(prompt, SYSTEM_PROMPT):
721
+ full_response = chunk
722
+ yield f"## πŸ”„ LLM analyzing...\n\n```\n{chunk[:500]}...\n```"
723
 
724
+ birds, analysis = parse_llm_response(full_response)
725
+ extra_info = f"**Colors:** {', '.join(features['colors'])} | **LLM:** {status}"
726
 
727
  if birds:
728
+ yield format_results(birds, analysis, extra_info)
729
  else:
730
+ yield f"## ⚠️ Could not identify bird\n\n**Colors:** {', '.join(features['colors'])}\n\n**LLM:** {status}"
 
 
 
 
 
 
 
731
 
732
  except Exception as e:
733
+ yield f"## ❌ Error: {str(e)}\n\n**LLM:** {status}"
734
 
735
 
736
+ def identify_description_stream(description: str) -> Generator[str, None, None]:
737
+ """Identify bird from description with streaming."""
738
  if not description or len(description.strip()) < 5:
739
+ yield "## ⚠️ Please enter a description"
740
+ return
741
 
742
  status = get_llm_status()
743
+ yield f"## πŸ”„ Processing description...\n\n**LLM:** {status}"
744
 
745
  prompt = f"""Identify the bird(s) from this description:
746
 
747
+ "{description}"
 
748
 
749
+ Focus on Indian birds. Provide scientific names."""
 
750
 
751
+ full_response = ""
752
+ for chunk in call_ollama_stream(prompt, SYSTEM_PROMPT):
753
+ full_response = chunk
754
+ yield f"## πŸ”„ LLM thinking...\n\n```\n{chunk[:500]}...\n```"
755
 
756
+ birds, analysis = parse_llm_response(full_response)
757
 
758
  if birds:
759
+ yield format_results(birds, analysis, f"**LLM:** {status}")
760
  else:
761
+ yield f"## ⚠️ Could not identify bird\n\n**LLM:** {status}"
 
 
 
 
762
 
763
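The three `*_stream` handlers work because Gradio treats a generator passed to an event listener as a streaming output: each `yield` replaces the bound component's value, which is what produces the progressive status updates. A minimal standalone sketch of the pattern (`slow_answer` and `demo_stream` are illustrative names; streaming relies on Gradio's queue, enabled by default in recent versions):

```python
import time

import gradio as gr

def slow_answer(question: str):
    # Each yield re-renders the Markdown component with the latest value.
    for step in ("Processing...", "Consulting LLM...", f"Done: you asked '{question}'"):
        yield step
        time.sleep(1)

with gr.Blocks() as demo_stream:
    box = gr.Textbox(label="Question")
    out = gr.Markdown()
    gr.Button("Ask").click(slow_answer, inputs=box, outputs=out)

# demo_stream.launch()
```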
 
764
  # ================== GRADIO UI ==================
765
 
766
+ with gr.Blocks(
767
+ title="🐦 BirdSense Pro",
768
+ css=CUSTOM_CSS
769
+ ) as demo:
770
 
771
+ # Header
772
  gr.HTML("""
773
+ <div class="header-banner">
774
+ <h1>🐦 BirdSense Pro</h1>
775
+ <p style="color: #94a3b8; font-size: 1.2rem; font-weight: 500;">META SAM-Audio + Llama3.2/Phi4 LLM</p>
776
+ <p style="color: #64748b; font-size: 1rem;">Dynamic Wikipedia Images • No Hardcoding • 10,000+ Species</p>
777
  </div>
778
  """)
779
 
780
+ # LLM Status
781
+ gr.HTML(f"""
782
+ <div class="llm-status">
783
+ <strong>Current LLM:</strong> {get_llm_status()}
784
+ </div>
785
+ """)
786
 
787
+ # Tabs
788
  with gr.Tabs():
789
+
790
+ # Audio Tab
791
  with gr.Tab("🎀 Audio Identification"):
792
  gr.Markdown("""
793
+ ### How it works:
794
+ 1. **SAM-Audio** separates bird calls from background noise
795
+ 2. **Features** (frequency, syllables, pattern) are extracted
796
+ 3. **LLM** identifies matching species from 10,000+ birds
797
+ 4. **Images** are fetched dynamically from Wikipedia
798
  """)
799
 
800
  with gr.Row():
801
  with gr.Column(scale=1):
802
+ audio_input = gr.Audio(
803
+ sources=["microphone", "upload"],
804
+ type="numpy",
805
+ label="🎀 Record or Upload Audio"
806
+ )
807
  with gr.Row():
808
+ location_input = gr.Textbox(
809
+ label="πŸ“ Location (optional)",
810
+ placeholder="e.g., Western Ghats, Mumbai"
811
+ )
812
+ month_input = gr.Dropdown(
813
+ label="πŸ“… Month",
814
+ choices=["", "January", "February", "March", "April", "May", "June",
815
+ "July", "August", "September", "October", "November", "December"]
816
+ )
817
+ audio_button = gr.Button(
818
+ "πŸ” Identify Bird",
819
+ variant="primary",
820
+ size="lg",
821
+ elem_classes=["primary-btn"]
822
+ )
823
 
824
  with gr.Column(scale=2):
825
+ audio_output = gr.Markdown(
826
+ value="*Results will appear here after identification...*",
827
+ elem_classes=["result-box"]
828
+ )
829
 
830
+ # Connect button to function
831
+ audio_button.click(
832
+ fn=identify_audio_stream,
833
+ inputs=[audio_input, location_input, month_input],
834
+ outputs=audio_output
835
+ )
836
+
837
+ # Image Tab
838
  with gr.Tab("πŸ“· Image Identification"):
839
  gr.Markdown("""
840
+ ### YOLO-style visual analysis:
841
+ - Extracts **colors** from different regions (head, body, tail)
842
+ - Detects **patterns** and shapes
843
+ - **LLM** matches to bird species
844
  """)
845
 
846
  with gr.Row():
847
  with gr.Column(scale=1):
848
+ image_input = gr.Image(
849
+ sources=["upload", "webcam"],
850
+ type="numpy",
851
+ label="πŸ“· Upload or Capture Image"
852
+ )
853
+ image_button = gr.Button(
854
+ "πŸ” Identify Bird",
855
+ variant="primary",
856
+ size="lg",
857
+ elem_classes=["primary-btn"]
858
+ )
859
+
860
  with gr.Column(scale=2):
861
+ image_output = gr.Markdown(
862
+ value="*Results will appear here...*",
863
+ elem_classes=["result-box"]
864
+ )
865
 
866
+ image_button.click(
867
+ fn=identify_image_stream,
868
+ inputs=[image_input],
869
+ outputs=image_output
870
+ )
871
 
872
+ # Description Tab
873
  with gr.Tab("πŸ“ Description"):
874
+ gr.Markdown("""
875
+ ### Describe the bird you saw:
876
+ Include colors, size, call sounds, behavior, habitat...
877
+ """)
878
+
879
  with gr.Row():
880
  with gr.Column(scale=1):
881
+ desc_input = gr.Textbox(
882
+ label="πŸ“ Bird Description",
883
+ lines=5,
884
+ placeholder="Example: Small green bird with red forehead, making repetitive 'tuk-tuk' sound, seen in garden near fruit trees"
885
  )
886
+ desc_button = gr.Button(
887
+ "πŸ” Identify Bird",
888
+ variant="primary",
889
+ size="lg",
890
+ elem_classes=["primary-btn"]
891
+ )
892
+
893
  with gr.Column(scale=2):
894
+ desc_output = gr.Markdown(
895
+ value="*Results will appear here...*",
896
+ elem_classes=["result-box"]
897
+ )
898
 
899
+ desc_button.click(
900
+ fn=identify_description_stream,
901
+ inputs=[desc_input],
902
+ outputs=desc_output
903
+ )
904
 
905
+ # About Tab
906
  with gr.Tab("ℹ️ About"):
907
  gr.Markdown("""
908
+ ## 🐦 BirdSense Pro
909
+
910
+ ### Key Features
911
+ - **No Hardcoding** — Images fetched dynamically from Wikipedia using scientific names
912
+ - **10,000+ Species** — LLM has knowledge of birds worldwide
913
+ - **SAM-Audio** — Isolates bird calls from background noise
914
+ - **Multi-Modal** — Audio, image, and text identification
915
+
916
+ ### Models
917
+ | Component | Technology |
918
+ |-----------|------------|
919
+ | Audio Preprocessing | META SAM-Audio (500-10000 Hz isolation) |
920
+ | LLM (Local) | Llama3.2 / Phi4 via Ollama |
921
+ | LLM (Cloud) | Mistral-7B via HuggingFace |
922
+ | Image Analysis | YOLO-inspired color/pattern extraction |
923
 
924
  ### CSCR Initiative
925
  Open-source bird identification for researchers in India.
926
  """)
927
 
928
+ # Footer
929
  gr.HTML("""
930
+ <div class="footer">
931
+ <p style="color: #4ade80; font-size: 1.1rem; font-weight: 600;">🐦 BirdSense Pro β€” CSCR Initiative</p>
932
+ <p style="color: #64748b;">Dynamic Images β€’ No Hardcoding β€’ LLM-Powered</p>
933
  </div>
934
  """)
935
 
 
937
  if __name__ == "__main__":
938
  print(f"\n🐦 BirdSense Pro")
939
  print(f"LLM: {get_llm_status()}")
940
+ print("Starting server on http://localhost:7860")
941
  demo.launch(server_name="0.0.0.0", server_port=7860)