sohiyiy committed
Commit b85196b · verified · 1 Parent(s): aaf8e92

Upload folder using huggingface_hub

Files changed (1):
  1. app.py +408 -481
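The commit message refers to `huggingface_hub`'s folder upload. For reference, a minimal sketch of the upload flow (the `repo_id` and `folder_path` below are placeholders, not values taken from this commit):

```python
# Sketch of the upload flow the commit message refers to.
# repo_id and folder_path are hypothetical placeholders.
from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` by default
api.upload_folder(
    folder_path="./birdsense",         # local folder to push
    repo_id="sohiyiy/birdsense-pro",   # hypothetical Space id
    repo_type="space",
    commit_message="Upload folder using huggingface_hub",
)
```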
app.py CHANGED
@@ -1,14 +1,17 @@
 """
 🐦 BirdSense Pro - AI Bird Identification
-Uses LLM (via HuggingFace Inference API) for TRUE zero-shot identification
-
-NOT hardcoded - Uses LLM knowledge of 10,000+ bird species!
 
 Features:
-1. Audio → LLM Analysis → Bird ID (zero-shot, any species)
 2. Image → LLM Vision → Bird ID
 3. Description → LLM → Bird ID
 4. Streaming responses
 
 CSCR Initiative
 """
@@ -22,17 +25,19 @@ from typing import Optional, Tuple, Dict, Any, List, Generator
 import json
 import os
 import requests
 
 # ================== CONFIG ==================
 SAMPLE_RATE = 48000
 HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
-# Backup models if primary fails
-BACKUP_MODELS = [
-    "https://api-inference.huggingface.co/models/google/flan-t5-xxl",
-    "https://api-inference.huggingface.co/models/tiiuae/falcon-7b-instruct"
-]
 
-# Bird images for common species (for display)
 BIRD_IMAGES = {
     "Asian Koel": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Eudynamys_scolopaceus_-_Koel_male_-_Sukhna_Lake%2C_India.jpg/320px-Eudynamys_scolopaceus_-_Koel_male_-_Sukhna_Lake%2C_India.jpg",
     "Indian Cuckoo": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Cuculus_micropterus.jpg/320px-Cuculus_micropterus.jpg",
@@ -49,14 +54,143 @@ BIRD_IMAGES = {
     "Spotted Owlet": "https://upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Spotted_Owlet_%28Athene_brama%29.jpg/320px-Spotted_Owlet_%28Athene_brama%29.jpg",
     "Rose-ringed Parakeet": "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e8/Psittacula_krameri_-_male_-_Fuerteventura.jpg/320px-Psittacula_krameri_-_male_-_Fuerteventura.jpg",
     "Greater Coucal": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d6/Greater_Coucal_%28Centropus_sinensis%29_in_Hyderabad%2C_AP_W_IMG_7544.jpg/320px-Greater_Coucal_%28Centropus_sinensis%29_in_Hyderabad%2C_AP_W_IMG_7544.jpg",
 }
-
 DEFAULT_IMAGE = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/45/Eopsaltria_australis_-_Mogo_Campground.jpg/320px-Eopsaltria_australis_-_Mogo_Campground.jpg"
 
 
 @dataclass
 class AudioFeatures:
-    """Audio features extracted for LLM analysis."""
     duration: float
     peak_frequency: float
     freq_range: Tuple[float, float]
@@ -69,79 +203,61 @@ class AudioFeatures:
     snr_db: float
 
     def to_description(self) -> str:
-        """Convert features to natural language for LLM."""
-        freq_desc = self._describe_frequency()
-        pattern_desc = self._describe_pattern()
 
-        return f"""Audio recording analysis:
 - Duration: {self.duration:.1f} seconds
 - Dominant frequency: {self.peak_frequency:.0f} Hz ({freq_desc})
 - Frequency range: {self.freq_range[0]:.0f} - {self.freq_range[1]:.0f} Hz
-- Call pattern: {pattern_desc}
-- Syllables detected: {self.num_syllables} (rate: {self.syllable_rate:.1f} per second)
-- Amplitude: {self.amplitude_pattern}
-- Recording quality: SNR {self.snr_db:.0f} dB"""
-
-    def _describe_frequency(self) -> str:
         f = self.peak_frequency
-        if f < 500: return "very low, likely large bird (owl, coucal, peacock)"
-        elif f < 1000: return "low, possibly crow, dove, or large bird"
-        elif f < 2000: return "low-medium, could be cuckoo, myna, or babbler"
-        elif f < 4000: return "medium, typical of most songbirds"
-        elif f < 6000: return "medium-high, warbler or sunbird range"
-        elif f < 8000: return "high, small passerine"
-        else: return "very high, insect-like or alarm call"
-
-    def _describe_pattern(self) -> str:
-        parts = []
-        if self.is_melodic:
-            parts.append("melodic/varied pitch")
-        else:
-            parts.append("monotone/single pitch")
-        if self.is_repetitive:
-            parts.append("repetitive")
-        else:
-            parts.append("variable/non-repetitive")
-        return ", ".join(parts)
 
 
-def extract_audio_features(audio: np.ndarray, sr: int) -> AudioFeatures:
-    """Extract comprehensive audio features."""
     duration = len(audio) / sr
     audio = audio / (np.max(np.abs(audio)) + 1e-8)
 
-    # Spectral analysis
     freqs, psd = signal.welch(audio, sr, nperseg=min(4096, len(audio)))
-    peak_idx = np.argmax(psd)
-    peak_freq = freqs[peak_idx]
-
     cumsum = np.cumsum(psd) / (np.sum(psd) + 1e-10)
     freq_low = freqs[np.searchsorted(cumsum, 0.10)]
     freq_high = freqs[np.searchsorted(cumsum, 0.90)]
 
-    spectral_centroid = np.sum(freqs * psd) / (np.sum(psd) + 1e-10)
-
-    # Envelope analysis
     envelope = np.abs(signal.hilbert(audio))
-    kernel = int(0.02 * sr)
-    if kernel > 0:
-        envelope = gaussian_filter1d(envelope, kernel)
 
-    # Syllable detection
     n_fft, hop = 2048, 512
     _, _, Zxx = signal.stft(audio, sr, nperseg=n_fft, noverlap=n_fft-hop)
     flux = np.sum(np.maximum(0, np.diff(np.abs(Zxx), axis=1)), axis=0)
     if len(flux) > 0:
         flux = flux / (np.max(flux) + 1e-10)
-        threshold = np.mean(flux) + 0.5 * np.std(flux)
-        peaks, _ = signal.find_peaks(flux, height=threshold, distance=max(1, int(0.05*sr/hop)))
-        num_syllables = len(peaks)
-    else:
-        num_syllables = 0
-
-    syllable_rate = num_syllables / duration if duration > 0 else 0
 
-    # Melodic detection
     is_melodic = False
     if len(audio) > sr:
         chunks = np.array_split(audio, min(20, max(5, int(duration*4))))
@@ -153,20 +269,16 @@ def extract_audio_features(audio: np.ndarray, sr: int) -> AudioFeatures:
         if chunk_freqs:
             is_melodic = np.std(chunk_freqs) / (np.mean(chunk_freqs) + 1e-10) > 0.15
 
-    # Repetitive detection
-    is_repetitive = syllable_rate > 3
-
     # Amplitude pattern
     if len(envelope) > 100:
         q = len(envelope) // 4
-        start, end = np.mean(envelope[:q]), np.mean(envelope[-q:])
-        var = np.std(envelope) / (np.mean(envelope) + 1e-10)
-        if var > 0.6: amp_pattern = "varied"
-        elif end > start * 1.3: amp_pattern = "ascending"
-        elif end < start * 0.7: amp_pattern = "descending"
         else: amp_pattern = "steady"
-    else:
-        amp_pattern = "unknown"
 
     # SNR
     noise = np.percentile(np.abs(audio), 5)
@@ -177,196 +289,96 @@ def extract_audio_features(audio: np.ndarray, sr: int) -> AudioFeatures:
         duration=duration,
         peak_frequency=float(peak_freq),
         freq_range=(float(freq_low), float(freq_high)),
-        spectral_centroid=float(spectral_centroid),
-        num_syllables=num_syllables,
-        syllable_rate=float(syllable_rate),
         is_melodic=is_melodic,
-        is_repetitive=is_repetitive,
         amplitude_pattern=amp_pattern,
         snr_db=float(snr)
     )
 
 
-def call_llm(prompt: str, system_prompt: str = None) -> str:
-    """
-    Call HuggingFace Inference API for LLM response.
-    Uses free tier - no API key needed for public models.
-    """
-    headers = {"Content-Type": "application/json"}
-
-    # Format prompt for instruction model
-    if system_prompt:
-        full_prompt = f"<s>[INST] {system_prompt}\n\n{prompt} [/INST]"
     else:
-        full_prompt = f"<s>[INST] {prompt} [/INST]"
-
-    payload = {
-        "inputs": full_prompt,
-        "parameters": {
-            "max_new_tokens": 1000,
-            "temperature": 0.3,
-            "return_full_text": False
-        }
-    }
 
-    # Try primary model
-    try:
-        response = requests.post(HF_API_URL, headers=headers, json=payload, timeout=60)
-        if response.status_code == 200:
-            result = response.json()
-            if isinstance(result, list) and len(result) > 0:
-                return result[0].get("generated_text", "")
-    except Exception as e:
-        print(f"Primary model failed: {e}")
 
-    # Try backup models
-    for backup_url in BACKUP_MODELS:
-        try:
-            response = requests.post(backup_url, headers=headers, json=payload, timeout=60)
-            if response.status_code == 200:
-                result = response.json()
-                if isinstance(result, list) and len(result) > 0:
-                    return result[0].get("generated_text", "")
-        except:
-            continue
 
-    return None
-
-
-def get_bird_image(bird_name: str) -> str:
-    """Get image URL for a bird species."""
-    # Check exact match first
-    if bird_name in BIRD_IMAGES:
-        return BIRD_IMAGES[bird_name]
 
-    # Check partial match
-    bird_lower = bird_name.lower()
-    for known_bird, url in BIRD_IMAGES.items():
-        if known_bird.lower() in bird_lower or bird_lower in known_bird.lower():
-            return url
 
-    return DEFAULT_IMAGE
 
 
 # ================== LLM PROMPTS ==================
 
-AUDIO_SYSTEM_PROMPT = """You are an expert ornithologist specializing in bird identification from audio recordings.
-You have extensive knowledge of 10,000+ bird species worldwide, with particular expertise in Indian birds (1,300+ species).
 
-Your task is to identify bird species from audio feature analysis. Consider:
-1. Frequency characteristics match known bird calls
-2. Call pattern (melodic vs monotone, repetitive vs variable)
-3. Syllable rate and duration
-4. Geographic likelihood if location provided
-5. Seasonal patterns if month provided
 
-IMPORTANT: You must identify ALL birds that could be present in the recording. Many recordings have multiple species calling.
 
-Respond in this EXACT JSON format:
 {
   "birds": [
     {
      "name": "Common Name",
      "scientific_name": "Genus species",
      "confidence": 85,
-     "reasoning": "Why this bird matches the audio features",
-     "call_description": "Description of this bird's typical call"
    }
  ],
-  "analysis": "Overall analysis of the recording",
-  "is_unusual": false,
-  "unusual_reason": null
-}
-
-Include ALL birds with confidence >= 50%. This supports multi-bird detection."""
-
-IMAGE_SYSTEM_PROMPT = """You are an expert ornithologist specializing in bird identification from photographs.
-You have extensive knowledge of 10,000+ bird species worldwide, with particular expertise in Indian birds.
-
-Analyze the image description and identify the bird species. Consider:
-1. Plumage colors and patterns
-2. Bill shape and size
-3. Body proportions
-4. Distinctive field marks
-5. Habitat clues in background
-
-Respond in this EXACT JSON format:
-{
-  "birds": [
-    {
-     "name": "Common Name",
-     "scientific_name": "Genus species",
-     "confidence": 85,
-     "reasoning": "Why this bird matches the visual features",
-     "visual_description": "Key visual identification features"
-    }
-  ],
-  "analysis": "Overall analysis of the image"
 }"""
 
-DESCRIPTION_SYSTEM_PROMPT = """You are an expert ornithologist helping identify birds from verbal descriptions.
-You have knowledge of 10,000+ bird species worldwide, especially Indian birds.
-
-Based on the user's description, identify the most likely bird species. Consider:
-1. Physical features mentioned
-2. Call/song descriptions
-3. Behavior patterns
-4. Habitat information
-5. Geographic context
 
-Respond in this EXACT JSON format:
-{
-  "birds": [
-    {
-     "name": "Common Name",
-     "scientific_name": "Genus species",
-     "confidence": 85,
-     "reasoning": "Why this matches the description",
-     "tips": "Additional ID tips for this species"
-    }
-  ],
-  "analysis": "Overall interpretation of the description"
-}"""
-
-
-def preprocess_audio(audio_data: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
-    """Preprocess audio for analysis."""
-    if audio_data.dtype == np.int16:
-        audio_data = audio_data.astype(np.float32) / 32768.0
-    elif audio_data.dtype == np.int32:
-        audio_data = audio_data.astype(np.float32) / 2147483648.0
-    else:
-        audio_data = audio_data.astype(np.float32)
-
-    if len(audio_data.shape) > 1:
-        audio_data = np.mean(audio_data, axis=1)
-
-    if sr != SAMPLE_RATE:
-        num_samples = int(len(audio_data) * SAMPLE_RATE / sr)
-        audio_data = signal.resample(audio_data, num_samples)
-        sr = SAMPLE_RATE
-
-    audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-8)
-
-    # Bandpass filter
-    nyq = sr / 2
-    low, high = 150 / nyq, min(15000 / nyq, 0.99)
-    b, a = signal.butter(4, [low, high], btype='band')
-    audio_data = signal.filtfilt(b, a, audio_data)
-
-    return audio_data, sr
 
 
-def format_bird_results(llm_response: str, source: str = "audio") -> str:
     """Parse LLM response and format with images."""
     try:
-        # Try to extract JSON from response
-        json_start = llm_response.find('{')
-        json_end = llm_response.rfind('}') + 1
-        if json_start >= 0 and json_end > json_start:
-            data = json.loads(llm_response[json_start:json_end])
         else:
-            raise ValueError("No JSON found")
 
         birds = data.get("birds", [])
         analysis = data.get("analysis", "")
@@ -374,22 +386,19 @@ def format_bird_results(llm_response: str, source: str = "audio") -> str:
         if not birds:
             return f"### ❌ No birds identified\n\n{analysis}"
 
-        output = "## 🐦 Birds Identified by AI\n\n"
-        output += f"*Analysis: {analysis}*\n\n"
 
         for i, bird in enumerate(birds, 1):
            name = bird.get("name", "Unknown")
            scientific = bird.get("scientific_name", "")
-           confidence = bird.get("confidence", 0)
-           reasoning = bird.get("reasoning", "")
 
-           # Get image
-           image_url = get_bird_image(name)
 
-           # Confidence badge
-           if confidence >= 80:
               badge = "🟢 HIGH"
-           elif confidence >= 60:
               badge = "🟡 MEDIUM"
            else:
               badge = "🔴 LOW"
@@ -397,194 +406,154 @@ def format_bird_results(llm_response: str, source: str = "audio") -> str:
            output += f"""
 ---
 
-### {i}. **{name}** ({confidence}%) {badge}
 
-![{name}]({image_url})
 
 **Scientific Name:** _{scientific}_
 
-**Why this bird:** {reasoning}
 
 """
-           # Add call description for audio
-           if source == "audio" and "call_description" in bird:
-               output += f"**Typical Call:** {bird['call_description']}\n\n"
-
-           # Add visual description for image
-           if source == "image" and "visual_description" in bird:
-               output += f"**Visual ID:** {bird['visual_description']}\n\n"
-
-           # Add tips for description
-           if source == "description" and "tips" in bird:
-               output += f"**ID Tips:** {bird['tips']}\n\n"
-
-        # Check for unusual sighting
-        if data.get("is_unusual"):
-            output += f"\n\n⚠️ **Unusual Sighting:** {data.get('unusual_reason', 'Rare or unexpected species')}\n"
 
        return output
 
-    except Exception as e:
-        # If parsing fails, return raw response
-        return f"### 🤖 AI Analysis\n\n{llm_response}\n\n*(Note: Could not parse structured response)*"
 
432
  # ================== IDENTIFICATION FUNCTIONS ==================
433
 
434
- def identify_from_audio_stream(audio, location: str = "", month: str = ""):
435
- """
436
- Stream bird identification from audio using LLM.
437
- This is the REAL zero-shot identification using LLM knowledge.
438
- """
439
  if audio is None:
440
- yield "### ⚠️ Please record or upload bird audio first!"
441
- return
442
 
443
- yield "### πŸ”„ Processing audio..."
 
444
 
445
  try:
446
  sr, audio_data = audio
447
  audio_data, sr = preprocess_audio(audio_data, sr)
448
 
449
- yield "### πŸ”„ Extracting audio features..."
450
- features = extract_audio_features(audio_data, sr)
451
-
452
- yield f"### πŸ”„ Analyzing with AI...\n\n**Features detected:**\n{features.to_description()}"
453
 
454
- # Build prompt with features
455
- prompt = f"""Identify the bird(s) in this recording based on these audio features:
456
 
457
  {features.to_description()}
458
-
459
  """
460
  if location:
461
- prompt += f"Location: {location}\n"
462
  if month:
463
- prompt += f"Month: {month}\n"
464
 
465
- prompt += "\nIdentify ALL birds that could be making these sounds. Include any bird with confidence >= 50%."
466
 
467
- yield "### πŸ”„ Consulting AI ornithologist (this may take 30-60 seconds)..."
468
 
469
- # Call LLM
470
- response = call_llm(prompt, AUDIO_SYSTEM_PROMPT)
471
 
472
  if response:
473
- result = format_bird_results(response, "audio")
474
- result += f"\n\n---\n\n### πŸ“Š Audio Features\n{features.to_description()}"
 
475
  yield result
476
  else:
477
- yield """### ⚠️ AI service temporarily unavailable
478
 
479
- The HuggingFace Inference API is currently busy. This is normal for free tier usage.
480
-
481
- **What you can try:**
482
- 1. Wait 30 seconds and try again
483
- 2. Try the Description tab (often faster)
484
- 3. Use a shorter audio clip
485
 
486
  **Your audio features:**
487
- """ + features.to_description()
 
 
 
 
 
 
488
 
489
  except Exception as e:
490
- yield f"### ❌ Error: {str(e)}\n\nPlease try again with a different recording."
491
 
492
 
493
- def identify_from_description_stream(description: str):
494
- """Stream bird identification from description using LLM."""
495
  if not description or len(description.strip()) < 5:
496
- yield "### ⚠️ Please enter a description (at least 5 characters)"
497
- return
498
 
499
- yield "### πŸ”„ Analyzing description with AI..."
 
500
 
501
  prompt = f"""Identify the bird(s) based on this description:
502
 
503
  {description}
504
 
505
- If multiple birds could match, list all with confidence >= 50%."""
506
-
507
- yield "### πŸ”„ Consulting AI ornithologist..."
508
 
509
- response = call_llm(prompt, DESCRIPTION_SYSTEM_PROMPT)
510
 
511
  if response:
512
- yield format_bird_results(response, "description")
 
 
513
  else:
514
- yield """### ⚠️ AI service temporarily unavailable
515
 
516
- Please try again in 30 seconds.
517
 
518
- **Tips for description:**
519
- - Mention colors (black, white, red, blue, green)
520
- - Describe the call (whistle, screech, chatter)
521
- - Note size (sparrow-sized, crow-sized)
522
- - Include habitat (garden, forest, water)
523
- - Add behavior (hops, flies in groups, perches high)"""
524
 
525
 
526
- def identify_from_image_stream(image):
527
- """Stream bird identification from image using LLM."""
528
  if image is None:
529
- yield "### ⚠️ Please upload or capture a bird image"
530
- return
531
 
532
- yield "### πŸ”„ Analyzing image..."
 
533
 
534
  try:
535
  if hasattr(image, 'numpy'):
536
- img_array = image.numpy()
537
  else:
538
- img_array = np.array(image)
539
 
540
- # Extract color information
541
  colors = []
542
- if len(img_array.shape) == 3 and img_array.shape[2] >= 3:
543
- avg_r = np.mean(img_array[:, :, 0])
544
- avg_g = np.mean(img_array[:, :, 1])
545
- avg_b = np.mean(img_array[:, :, 2])
546
-
547
- if avg_g > avg_r * 1.1 and avg_g > avg_b * 1.1:
548
- colors.append("green")
549
- if avg_b > avg_r * 1.1 and avg_b > avg_g:
550
- colors.append("blue")
551
- if avg_r > avg_g * 1.2 and avg_r > avg_b * 1.2:
552
- colors.append("red or brown")
553
- if avg_r > 180 and avg_g > 180 and avg_b > 180:
554
- colors.append("white")
555
- if avg_r < 80 and avg_g < 80 and avg_b < 80:
556
- colors.append("black")
557
- if avg_r > 150 and avg_g > 120 and avg_b < 100:
558
- colors.append("yellow or golden")
559
- if avg_r > 100 and avg_g > 80 and avg_b > 60 and avg_r < 180:
560
- colors.append("brown")
561
 
562
- color_desc = ", ".join(colors) if colors else "mixed colors"
563
 
564
- yield f"### πŸ”„ Detected colors: {color_desc}\n\nConsulting AI ornithologist..."
565
-
566
- prompt = f"""Identify the bird in this image.
567
 
 
 
568
  Detected dominant colors: {color_desc}
569
- Image dimensions: {img_array.shape[1]}x{img_array.shape[0]} pixels
570
 
571
- Based on the color analysis, what Indian bird species could this be?
572
- Consider common birds with these colors in their plumage."""
573
 
574
- response = call_llm(prompt, IMAGE_SYSTEM_PROMPT)
575
 
576
  if response:
577
- yield format_bird_results(response, "image")
 
 
 
578
  else:
579
- yield f"""### ⚠️ AI service temporarily unavailable
580
-
581
- **Detected colors:** {color_desc}
582
-
583
- Try the Description tab and describe:
584
- - The exact colors you see
585
- - Bill shape and color
586
- - Body size
587
- - Any distinctive markings"""
588
 
589
  except Exception as e:
590
  yield f"### ❌ Error: {str(e)}"
@@ -592,185 +561,143 @@ Try the Description tab and describe:
 
 # ================== GRADIO UI ==================
 
-with gr.Blocks(title="🐦 BirdSense Pro - AI Bird ID") as demo:
 
     gr.HTML("""
     <div style="text-align: center; background: linear-gradient(135deg, #1a4d2e 0%, #2d5a3e 50%, #1a4d2e 100%); padding: 2rem; border-radius: 16px; margin-bottom: 1.5rem;">
        <h1 style="color: #4ade80; font-size: 2.5rem; margin: 0;">🐦 BirdSense Pro</h1>
-       <p style="color: #94a3b8; font-size: 1.2rem;">AI-Powered Bird Identification</p>
        <p style="color: #64748b; font-size: 0.9rem;">
-           🤖 Uses LLM knowledge of <b>10,000+ species</b> • NOT hardcoded!
-       </p>
-       <p style="color: #475569; font-size: 0.8rem;">
-           Audio • Image • Description | Multi-bird detection | Streaming responses
        </p>
    </div>
    """)
 
    with gr.Tabs():
-        # === AUDIO TAB ===
-        with gr.Tab("🎤 Audio (LLM Analysis)"):
            gr.Markdown("""
-            ### 🎤 Record or upload bird audio
-
-            **How it works:**
-            1. We extract audio features (frequency, pattern, syllables)
-            2. These features are sent to an AI (LLM) that knows 10,000+ bird species
-            3. The AI identifies ALL matching birds (multi-bird detection)
-
-            *This is TRUE zero-shot identification - not hardcoded!*
            """)
 
            with gr.Row():
                with gr.Column(scale=1):
-                    audio_input = gr.Audio(
-                        sources=["microphone", "upload"],
-                        type="numpy",
-                        label="🎤 Bird Audio"
-                    )
                    with gr.Row():
-                        location = gr.Textbox(label="📍 Location (optional)", placeholder="e.g., Western Ghats, Kerala")
-                        month = gr.Dropdown(
-                            label="📅 Month (optional)",
-                            choices=["", "January", "February", "March", "April", "May", "June",
-                                     "July", "August", "September", "October", "November", "December"]
                        )
-                    audio_btn = gr.Button("🔍 Identify Birds with AI", variant="primary", size="lg")
 
                with gr.Column(scale=2):
-                    audio_output = gr.Markdown(label="AI Results (streaming)")
 
-            audio_btn.click(
-                fn=identify_from_audio_stream,
-                inputs=[audio_input, location, month],
-                outputs=[audio_output]
-            )
 
649
- # === IMAGE TAB ===
650
- with gr.Tab("πŸ“· Image (LLM Analysis)"):
651
  gr.Markdown("""
652
- ### πŸ“· Upload or capture a bird image
653
-
654
- **How it works:**
655
- 1. We analyze colors and patterns in the image
656
- 2. This information is sent to an AI for identification
657
- 3. The AI uses its knowledge of bird plumage to identify species
658
  """)
659
 
660
  with gr.Row():
661
  with gr.Column(scale=1):
662
- image_input = gr.Image(
663
- sources=["upload", "webcam"],
664
- type="numpy",
665
- label="πŸ“· Bird Image"
666
  )
667
- image_btn = gr.Button("πŸ” Identify Bird with AI", variant="primary", size="lg")
668
 
669
  with gr.Column(scale=2):
670
- image_output = gr.Markdown(label="AI Results")
671
 
672
- image_btn.click(
673
- fn=identify_from_image_stream,
674
- inputs=[image_input],
675
- outputs=[image_output]
676
- )
677
 
678
- # === DESCRIPTION TAB ===
679
- with gr.Tab("πŸ“ Description (LLM Analysis)"):
680
  gr.Markdown("""
681
- ### πŸ“ Describe the bird you saw or heard
682
-
683
- **This is the most reliable method!** The AI can understand natural language descriptions.
684
-
685
- Describe: colors, size, call/song, behavior, habitat, location
686
  """)
687
 
688
  with gr.Row():
689
  with gr.Column(scale=1):
690
- desc_input = gr.Textbox(
691
- label="Bird Description",
692
- placeholder="""Example descriptions:
693
-
694
- "Small green bird with red forehead, making a repetitive tuk-tuk sound like a hammer"
695
-
696
- "Black and white bird with a beautiful melodious song, often seen in gardens at dawn"
697
-
698
- "Large brown bird with chattering call, always in groups of 6-7"
699
-
700
- "Bright blue bird with orange breast, sitting near water"
701
- """,
702
- lines=6
703
- )
704
- desc_btn = gr.Button("πŸ” Identify Bird with AI", variant="primary", size="lg")
705
 
706
  with gr.Column(scale=2):
707
- desc_output = gr.Markdown(label="AI Results")
708
 
709
- desc_btn.click(
710
- fn=identify_from_description_stream,
711
- inputs=[desc_input],
712
- outputs=[desc_output]
713
- )
714
 
715
- # === HOW IT WORKS TAB ===
716
- with gr.Tab("ℹ️ How It Works"):
717
- gr.Markdown("""
718
- ## 🧠 How BirdSense Pro Works
719
-
720
- ### NOT Hardcoded!
721
-
722
- Unlike simple rule-based systems, BirdSense Pro uses a **Large Language Model (LLM)**
723
- that has learned about birds from millions of documents, scientific papers, and bird guides.
724
-
725
- The LLM knows:
726
- - **10,000+ bird species** worldwide
727
- - **1,300+ Indian bird species** in detail
728
- - Bird calls, songs, and vocalizations
729
- - Plumage patterns and colors
730
- - Habitat preferences
731
- - Seasonal patterns
732
- - Geographic distributions
733
-
734
- ### Pipeline
735
-
736
- ```
737
- Audio Recording
738
- ↓
739
- Feature Extraction (frequency, pattern, syllables)
740
- ↓
741
- Natural Language Description of Features
742
- ↓
743
- LLM Analysis (Mistral-7B via HuggingFace)
744
- ↓
745
- Bird Identification with Confidence Scores
746
- ```
747
-
748
- ### Multi-Bird Detection
749
-
750
- If your recording has multiple species calling, the AI will identify ALL of them!
751
-
752
- ### Limitations
753
-
754
- - Depends on HuggingFace Inference API (free tier has rate limits)
755
- - May take 30-60 seconds for response
756
- - Image analysis is based on color extraction + LLM (not a vision model)
757
-
758
- ### For Best Results
759
-
760
- 1. **Audio:** Clear recordings with minimal background noise
761
- 2. **Image:** Good lighting, bird clearly visible
762
- 3. **Description:** Be specific about colors, calls, and behavior
763
  """)
764
 
765
  gr.HTML("""
766
  <div style="text-align: center; padding: 1rem; margin-top: 1rem; border-top: 1px solid #334155;">
767
  <p style="color: #4ade80; font-weight: bold;">🐦 BirdSense Pro - CSCR Initiative</p>
768
- <p style="color: #94a3b8;">Powered by LLM (10,000+ species) β€’ NOT hardcoded</p>
769
  <p style="color: #64748b;">
770
- <a href="https://github.com/sohamzycus/eagv2/tree/master/birdsense" style="color: #4ade80;">GitHub</a>
771
  </p>
772
  </div>
773
  """)
774
 
 
775
  if __name__ == "__main__":
 
 
 
776
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
app.py (new version, after this commit):

 """
 🐦 BirdSense Pro - AI Bird Identification
+Uses LOCAL Ollama LLM for TRUE zero-shot identification
 
+Supports:
+- Ollama (local) - PRIMARY (fast, no limits)
+- HuggingFace API - FALLBACK (for cloud deployment)
 
 Features:
+1. Audio → LLM Analysis → Bird ID (zero-shot, 10,000+ species)
 2. Image → LLM Vision → Bird ID
 3. Description → LLM → Bird ID
 4. Streaming responses
+5. Multi-bird detection
 
 CSCR Initiative
 """
 ...
 import json
 import os
 import requests
+import time
 
 # ================== CONFIG ==================
 SAMPLE_RATE = 48000
+
+# Ollama configuration (LOCAL - primary)
+OLLAMA_URL = "http://localhost:11434"
+OLLAMA_MODEL = "qwen2.5:3b"  # Fast, good for bird ID
+
+# HuggingFace API (FALLBACK - for cloud deployment)
 HF_API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
 
+# Bird images
 BIRD_IMAGES = {
     "Asian Koel": "https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Eudynamys_scolopaceus_-_Koel_male_-_Sukhna_Lake%2C_India.jpg/320px-Eudynamys_scolopaceus_-_Koel_male_-_Sukhna_Lake%2C_India.jpg",
     "Indian Cuckoo": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6b/Cuculus_micropterus.jpg/320px-Cuculus_micropterus.jpg",
     ...
     "Spotted Owlet": "https://upload.wikimedia.org/wikipedia/commons/thumb/9/9d/Spotted_Owlet_%28Athene_brama%29.jpg/320px-Spotted_Owlet_%28Athene_brama%29.jpg",
     "Rose-ringed Parakeet": "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e8/Psittacula_krameri_-_male_-_Fuerteventura.jpg/320px-Psittacula_krameri_-_male_-_Fuerteventura.jpg",
     "Greater Coucal": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d6/Greater_Coucal_%28Centropus_sinensis%29_in_Hyderabad%2C_AP_W_IMG_7544.jpg/320px-Greater_Coucal_%28Centropus_sinensis%29_in_Hyderabad%2C_AP_W_IMG_7544.jpg",
+    "Common Tailorbird": "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Common_Tailorbird_%28Orthotomus_sutorius%29_in_Kolkata_I_IMG_2859.jpg/320px-Common_Tailorbird_%28Orthotomus_sutorius%29_in_Kolkata_I_IMG_2859.jpg",
+    "Green Bee-eater": "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b1/Merops_orientalis_%28Pune%2C_India%29.jpg/320px-Merops_orientalis_%28Pune%2C_India%29.jpg",
+    "Common Hawk-Cuckoo": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/08/Hierococcyx_varius.jpg/320px-Hierococcyx_varius.jpg",
+    "Indian Robin": "https://upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Indian_Robin_%28Saxicoloides_fulicatus%29_Male.jpg/320px-Indian_Robin_%28Saxicoloides_fulicatus%29_Male.jpg",
+    "Grey Francolin": "https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/Grey_francolin_%28Francolinus_pondicerianus%29.jpg/320px-Grey_francolin_%28Francolinus_pondicerianus%29.jpg",
 }
 DEFAULT_IMAGE = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/45/Eopsaltria_australis_-_Mogo_Campground.jpg/320px-Eopsaltria_australis_-_Mogo_Campground.jpg"
 
+# ================== OLLAMA CLIENT ==================
+
+class OllamaClient:
+    """Client for local Ollama LLM."""
+
+    def __init__(self, base_url: str = OLLAMA_URL, model: str = OLLAMA_MODEL):
+        self.base_url = base_url
+        self.model = model
+        self._available = None
+
+    def is_available(self) -> bool:
+        """Check if Ollama is running."""
+        if self._available is not None:
+            return self._available
+        try:
+            resp = requests.get(f"{self.base_url}/api/tags", timeout=2)
+            self._available = resp.status_code == 200
+            return self._available
+        except:
+            self._available = False
+            return False
+
+    def generate(self, prompt: str, system: str = None, stream: bool = False) -> str:
+        """Generate response from Ollama."""
+        payload = {
+            "model": self.model,
+            "prompt": prompt,
+            "stream": stream,
+            "options": {
+                "temperature": 0.3,
+                "num_predict": 1500
+            }
+        }
+
+        if system:
+            payload["system"] = system
+
+        try:
+            if stream:
+                return self._generate_stream(payload)
+            else:
+                resp = requests.post(
+                    f"{self.base_url}/api/generate",
+                    json=payload,
+                    timeout=120
+                )
+                if resp.status_code == 200:
+                    return resp.json().get("response", "")
+                return None
+        except Exception as e:
+            print(f"Ollama error: {e}")
+            return None
+
+    def _generate_stream(self, payload) -> Generator[str, None, None]:
+        """Stream response from Ollama."""
+        try:
+            with requests.post(
+                f"{self.base_url}/api/generate",
+                json=payload,
+                stream=True,
+                timeout=120
+            ) as resp:
+                for line in resp.iter_lines():
+                    if line:
+                        data = json.loads(line)
+                        if "response" in data:
+                            yield data["response"]
+                        if data.get("done"):
+                            break
+        except Exception as e:
+            yield f"Error: {e}"
+
+
+# Global Ollama client
+ollama = OllamaClient()
+
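The `/api/tags` and `/api/generate` endpoints used above are Ollama's standard REST API. A quick smoke test of the new client (a sketch; it assumes `ollama serve` is running on `localhost:11434` and the model named in `OLLAMA_MODEL` has been pulled):

```python
# Sketch: smoke test for OllamaClient (assumes a local Ollama server
# with the OLLAMA_MODEL model already pulled).
client = OllamaClient()
if client.is_available():
    reply = client.generate("Name three birds common in Indian gardens.")
    print(reply)
else:
    print("Ollama is not reachable - start it with `ollama serve`.")
```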
+def call_llm(prompt: str, system: str = None, stream: bool = False):
+    """
+    Call LLM - tries Ollama first (local), falls back to HuggingFace API.
+    """
+    # Try Ollama first (local, fast)
+    if ollama.is_available():
+        result = ollama.generate(prompt, system, stream=stream)
+        if result:
+            return result
+
+    # Fallback to HuggingFace API
+    try:
+        headers = {"Content-Type": "application/json"}
+        if system:
+            full_prompt = f"<s>[INST] {system}\n\n{prompt} [/INST]"
+        else:
+            full_prompt = f"<s>[INST] {prompt} [/INST]"
+
+        payload = {
+            "inputs": full_prompt,
+            "parameters": {
+                "max_new_tokens": 1500,
+                "temperature": 0.3,
+                "return_full_text": False
+            }
+        }
+
+        resp = requests.post(HF_API_URL, headers=headers, json=payload, timeout=90)
+        if resp.status_code == 200:
+            result = resp.json()
+            if isinstance(result, list) and len(result) > 0:
+                return result[0].get("generated_text", "")
+    except Exception as e:
+        print(f"HuggingFace API error: {e}")
+
+    return None
+
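Callers never need to know which backend answered. A minimal usage sketch (the question text is illustrative only):

```python
# Sketch: call_llm hides the Ollama-vs-HuggingFace choice from the caller.
answer = call_llm(
    "A koel-like call at dawn, rising in pitch - which species fits?",
    system="You are an expert ornithologist. Answer in one sentence.",
)
print(answer if answer else "No LLM backend responded.")
```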
+def get_llm_status() -> str:
+    """Get current LLM status."""
+    if ollama.is_available():
+        return f"🟢 Ollama ({OLLAMA_MODEL}) - LOCAL"
+    else:
+        return "🟡 HuggingFace API - CLOUD (slower)"
+
+
+# ================== AUDIO FEATURES ==================
+
 @dataclass
 class AudioFeatures:
+    """Audio features for LLM analysis."""
     duration: float
     peak_frequency: float
     freq_range: Tuple[float, float]
     ...
     snr_db: float
 
     def to_description(self) -> str:
+        """Convert to natural language for LLM."""
+        freq_desc = self._describe_freq()
 
+        return f"""Audio analysis results:
 - Duration: {self.duration:.1f} seconds
 - Dominant frequency: {self.peak_frequency:.0f} Hz ({freq_desc})
 - Frequency range: {self.freq_range[0]:.0f} - {self.freq_range[1]:.0f} Hz
+- Call pattern: {"melodic" if self.is_melodic else "monotone"}, {"repetitive" if self.is_repetitive else "variable"}
+- Syllables: {self.num_syllables} detected ({self.syllable_rate:.1f}/second)
+- Amplitude pattern: {self.amplitude_pattern}
+- Recording quality: SNR {self.snr_db:.0f} dB ({"good" if self.snr_db > 15 else "fair" if self.snr_db > 8 else "poor"})"""
+
+    def _describe_freq(self) -> str:
         f = self.peak_frequency
+        if f < 500: return "very low - large bird like coucal, peacock, owl"
+        elif f < 1000: return "low - crow, dove, large bird"
+        elif f < 2000: return "low-medium - cuckoo, myna, babbler"
+        elif f < 4000: return "medium - most songbirds, bulbul, robin"
+        elif f < 6000: return "medium-high - warbler, tailorbird"
+        elif f < 8000: return "high - sunbird, small passerine"
+        else: return "very high - alarm call or insect-like"
 
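What the LLM actually receives is the rendered `to_description()` text. An illustrative instance (all values invented, not from a real recording, and assuming the field set visible in the `AudioFeatures(...)` constructor call further down):

```python
# Illustrative AudioFeatures instance - values are made up, not measured.
demo_features = AudioFeatures(
    duration=3.2, peak_frequency=1800.0, freq_range=(900.0, 2600.0),
    spectral_centroid=1750.0, num_syllables=12, syllable_rate=3.8,
    is_melodic=True, is_repetitive=True, amplitude_pattern="steady",
    snr_db=18.0,
)
print(demo_features.to_description())  # lands in the "low-medium - cuckoo, myna, babbler" band
```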
+def extract_features(audio: np.ndarray, sr: int) -> AudioFeatures:
+    """Extract audio features."""
     duration = len(audio) / sr
     audio = audio / (np.max(np.abs(audio)) + 1e-8)
 
+    # Spectral
     freqs, psd = signal.welch(audio, sr, nperseg=min(4096, len(audio)))
+    peak_freq = freqs[np.argmax(psd)]
     cumsum = np.cumsum(psd) / (np.sum(psd) + 1e-10)
     freq_low = freqs[np.searchsorted(cumsum, 0.10)]
     freq_high = freqs[np.searchsorted(cumsum, 0.90)]
+    centroid = np.sum(freqs * psd) / (np.sum(psd) + 1e-10)
 
+    # Envelope
     envelope = np.abs(signal.hilbert(audio))
+    k = int(0.02 * sr)
+    if k > 0:
+        envelope = gaussian_filter1d(envelope, k)
 
+    # Syllables
     n_fft, hop = 2048, 512
     _, _, Zxx = signal.stft(audio, sr, nperseg=n_fft, noverlap=n_fft-hop)
     flux = np.sum(np.maximum(0, np.diff(np.abs(Zxx), axis=1)), axis=0)
+    num_syl = 0
     if len(flux) > 0:
         flux = flux / (np.max(flux) + 1e-10)
+        th = np.mean(flux) + 0.5 * np.std(flux)
+        peaks, _ = signal.find_peaks(flux, height=th, distance=max(1, int(0.05*sr/hop)))
+        num_syl = len(peaks)
+    syl_rate = num_syl / duration if duration > 0 else 0
 
+    # Melodic
     is_melodic = False
     if len(audio) > sr:
         chunks = np.array_split(audio, min(20, max(5, int(duration*4))))
         ...
         if chunk_freqs:
             is_melodic = np.std(chunk_freqs) / (np.mean(chunk_freqs) + 1e-10) > 0.15
 
     # Amplitude pattern
+    amp_pattern = "unknown"
     if len(envelope) > 100:
         q = len(envelope) // 4
+        s, e = np.mean(envelope[:q]), np.mean(envelope[-q:])
+        v = np.std(envelope) / (np.mean(envelope) + 1e-10)
+        if v > 0.6: amp_pattern = "varied"
+        elif e > s * 1.3: amp_pattern = "ascending"
+        elif e < s * 0.7: amp_pattern = "descending"
         else: amp_pattern = "steady"
 
     # SNR
     noise = np.percentile(np.abs(audio), 5)
     ...
         duration=duration,
         peak_frequency=float(peak_freq),
         freq_range=(float(freq_low), float(freq_high)),
+        spectral_centroid=float(centroid),
+        num_syllables=num_syl,
+        syllable_rate=float(syl_rate),
         is_melodic=is_melodic,
+        is_repetitive=syl_rate > 3,
         amplitude_pattern=amp_pattern,
         snr_db=float(snr)
     )
 
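A synthetic sanity check of the renamed extractor (a sketch: a pure 2 kHz tone, not a bird, just to confirm the pipeline runs; it assumes the context lines elided from `extract_features` above are present in the full file):

```python
# Sketch: run the feature extractor on a synthetic 2 kHz tone.
import numpy as np
t = np.linspace(0, 2.0, 2 * SAMPLE_RATE, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 2000 * t)).astype(np.float32)
feats = extract_features(tone, SAMPLE_RATE)
print(feats.to_description())  # dominant frequency should land near 2000 Hz
```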
+ def preprocess_audio(audio_data: np.ndarray, sr: int) -> Tuple[np.ndarray, int]:
303
+ """Preprocess audio."""
304
+ if audio_data.dtype == np.int16:
305
+ audio_data = audio_data.astype(np.float32) / 32768.0
306
+ elif audio_data.dtype == np.int32:
307
+ audio_data = audio_data.astype(np.float32) / 2147483648.0
 
 
 
 
308
  else:
309
+ audio_data = audio_data.astype(np.float32)
 
 
 
 
 
 
 
 
 
310
 
311
+ if len(audio_data.shape) > 1:
312
+ audio_data = np.mean(audio_data, axis=1)
 
 
 
 
 
 
 
313
 
314
+ if sr != SAMPLE_RATE:
315
+ num = int(len(audio_data) * SAMPLE_RATE / sr)
316
+ audio_data = signal.resample(audio_data, num)
317
+ sr = SAMPLE_RATE
 
 
 
 
 
 
318
 
319
+ audio_data = audio_data / (np.max(np.abs(audio_data)) + 1e-8)
 
 
 
 
 
 
 
320
 
321
+ # Bandpass
322
+ nyq = sr / 2
323
+ low, high = 150 / nyq, min(15000 / nyq, 0.99)
324
+ b, a = signal.butter(4, [low, high], btype='band')
325
+ audio_data = signal.filtfilt(b, a, audio_data)
326
 
327
+ return audio_data, sr
328
 
329
 
330
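The function accepts raw Gradio microphone data (an int16 array plus sample rate). A minimal round-trip sketch (input values are synthetic noise, for illustration only):

```python
# Sketch: 16-bit mic input at 44.1 kHz in, normalized/bandpassed 48 kHz out.
import numpy as np
raw = (np.random.randn(44100) * 3000).astype(np.int16)  # 1 second of noise
clean, out_sr = preprocess_audio(raw, 44100)
print(out_sr, len(clean))  # 48000, 48000 samples after resampling
```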
  # ================== LLM PROMPTS ==================
331
 
332
+ BIRD_EXPERT_SYSTEM = """You are an expert ornithologist with knowledge of 10,000+ bird species worldwide.
333
+ You specialize in Indian birds (1,300+ species).
334
 
335
+ Your task: Identify bird species from audio features, images, or descriptions.
 
 
 
 
 
336
 
337
+ IMPORTANT RULES:
338
+ 1. Identify ALL birds that could be present (multi-bird detection)
339
+ 2. Include any bird with confidence >= 50%
340
+ 3. Consider frequency, pattern, syllable rate, and context
341
+ 4. For India, consider common species first but don't ignore rare possibilities
342
 
343
+ You MUST respond in this EXACT JSON format:
344
  {
345
  "birds": [
346
  {
347
  "name": "Common Name",
348
  "scientific_name": "Genus species",
349
  "confidence": 85,
350
+ "reasoning": "Brief explanation of why this bird matches"
 
351
  }
352
  ],
353
+ "analysis": "Overall analysis of the recording/image/description"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  }"""
355
 
 
 
 
 
 
 
 
 
 
356
 
+def get_bird_image(name: str) -> str:
+    """Get image URL for bird."""
+    if name in BIRD_IMAGES:
+        return BIRD_IMAGES[name]
+    name_lower = name.lower()
+    for bird, url in BIRD_IMAGES.items():
+        if bird.lower() in name_lower or name_lower in bird.lower():
+            return url
+    return DEFAULT_IMAGE
 
+def format_results(llm_response: str) -> str:
     """Parse LLM response and format with images."""
+    if not llm_response:
+        return "### ⚠️ No response from LLM"
+
     try:
+        # Extract JSON
+        start = llm_response.find('{')
+        end = llm_response.rfind('}') + 1
+        if start >= 0 and end > start:
+            data = json.loads(llm_response[start:end])
         else:
+            # Try to find birds mentioned in text
+            return f"### 🤖 AI Analysis\n\n{llm_response}"
 
         birds = data.get("birds", [])
         analysis = data.get("analysis", "")
 
         if not birds:
             return f"### ❌ No birds identified\n\n{analysis}"
 
+        output = f"## 🐦 Birds Identified\n\n*{analysis}*\n\n"
 
         for i, bird in enumerate(birds, 1):
            name = bird.get("name", "Unknown")
            scientific = bird.get("scientific_name", "")
+           conf = bird.get("confidence", 0)
+           reason = bird.get("reasoning", "")
 
+           img = get_bird_image(name)
 
+           if conf >= 80:
               badge = "🟢 HIGH"
+           elif conf >= 60:
               badge = "🟡 MEDIUM"
            else:
               badge = "🔴 LOW"
 
            output += f"""
 ---
 
+### {i}. **{name}** ({conf}%) {badge}
 
+![{name}]({img})
 
 **Scientific Name:** _{scientific}_
 
+**Why this bird:** {reason}
 
 """
 
        return output
 
+    except json.JSONDecodeError:
+        return f"### 🤖 AI Analysis\n\n{llm_response}"
 
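To see how the parser tolerates chatter around the JSON, a hypothetical model reply (invented for illustration; its shape follows the `BIRD_EXPERT_SYSTEM` contract):

```python
# Hypothetical LLM reply - shape follows BIRD_EXPERT_SYSTEM's JSON contract.
sample_reply = '''Sure, here is my identification:
{"birds": [{"name": "Asian Koel",
            "scientific_name": "Eudynamys scolopaceus",
            "confidence": 88,
            "reasoning": "Loud, rising ku-oo whistle in the low-medium band"}],
 "analysis": "Single melodic, repetitive caller"}'''
print(format_results(sample_reply))  # renders badge, image, and reasoning
```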
 # ================== IDENTIFICATION FUNCTIONS ==================
 
+def identify_audio(audio, location: str = "", month: str = ""):
+    """Identify bird from audio using LLM."""
     if audio is None:
+        return "### ⚠️ Please record or upload bird audio"
 
+    status = get_llm_status()
+    yield f"### 🔄 Processing audio...\n\n**LLM Status:** {status}"
 
     try:
         sr, audio_data = audio
         audio_data, sr = preprocess_audio(audio_data, sr)
 
+        yield f"### 🔄 Extracting features...\n\n**LLM Status:** {status}"
+        features = extract_features(audio_data, sr)
 
+        prompt = f"""Identify the bird(s) in this recording:
 
 {features.to_description()}
 """
         if location:
+            prompt += f"\nLocation: {location}"
         if month:
+            prompt += f"\nMonth: {month}"
 
+        prompt += "\n\nIdentify ALL birds that could be making these sounds (confidence >= 50%)."
 
+        yield f"### 🔄 Consulting AI ({status})...\n\n**Audio Features:**\n{features.to_description()}"
 
+        response = call_llm(prompt, BIRD_EXPERT_SYSTEM)
 
        if response:
+            result = format_results(response)
+            result += f"\n\n---\n\n### 📊 Audio Analysis\n{features.to_description()}"
+            result += f"\n\n**LLM:** {status}"
            yield result
        else:
+            yield f"""### ⚠️ LLM not responding
 
+**LLM Status:** {status}
 
 **Your audio features:**
+{features.to_description()}
+
+**To fix:**
+1. Make sure Ollama is running: `ollama serve`
+2. Pull the model: `ollama pull {OLLAMA_MODEL}`
+3. Try again
+"""
 
    except Exception as e:
+        yield f"### ❌ Error: {str(e)}"
 
+def identify_description(description: str):
+    """Identify bird from description using LLM."""
     if not description or len(description.strip()) < 5:
+        return "### ⚠️ Please enter a description (at least 5 characters)"
 
+    status = get_llm_status()
+    yield f"### 🔄 Analyzing description...\n\n**LLM Status:** {status}"
 
     prompt = f"""Identify the bird(s) based on this description:
 
 {description}
 
+Consider Indian birds especially. List all matching birds with confidence >= 50%."""
 
+    response = call_llm(prompt, BIRD_EXPERT_SYSTEM)
 
     if response:
+        result = format_results(response)
+        result += f"\n\n**LLM:** {status}"
+        yield result
     else:
+        yield f"""### ⚠️ LLM not responding
 
+**LLM Status:** {status}
 
+**To fix:**
+1. Make sure Ollama is running: `ollama serve`
+2. Pull the model: `ollama pull {OLLAMA_MODEL}`
+"""
 
+def identify_image(image):
+    """Identify bird from image using LLM."""
     if image is None:
+        return "### ⚠️ Please upload or capture a bird image"
 
+    status = get_llm_status()
+    yield f"### 🔄 Analyzing image...\n\n**LLM Status:** {status}"
 
     try:
         if hasattr(image, 'numpy'):
+            img = image.numpy()
         else:
+            img = np.array(image)
 
+        # Color analysis
         colors = []
+        if len(img.shape) == 3 and img.shape[2] >= 3:
+            r, g, b = np.mean(img[:,:,0]), np.mean(img[:,:,1]), np.mean(img[:,:,2])
+            if g > r * 1.1 and g > b * 1.1: colors.append("green")
+            if b > r * 1.1 and b > g: colors.append("blue")
+            if r > g * 1.2 and r > b * 1.2: colors.append("red/brown")
+            if r > 180 and g > 180 and b > 180: colors.append("white")
+            if r < 80 and g < 80 and b < 80: colors.append("black")
+            if r > 150 and g > 120 and b < 100: colors.append("yellow")
 
+        color_desc = ", ".join(colors) if colors else "mixed"
 
+        yield f"### 🔄 Detected colors: {color_desc}\n\n**LLM Status:** {status}"
 
+        prompt = f"""Identify the bird in this image.
+
 Detected dominant colors: {color_desc}
+Image size: {img.shape[1]}x{img.shape[0]} pixels
 
+Based on these colors, what Indian bird species could this be?
+List all matching birds with confidence >= 50%."""
 
+        response = call_llm(prompt, BIRD_EXPERT_SYSTEM)
 
        if response:
+            result = format_results(response)
+            result += f"\n\n**Detected colors:** {color_desc}"
+            result += f"\n\n**LLM:** {status}"
+            yield result
        else:
+            yield f"### ⚠️ LLM not responding\n\n**Detected colors:** {color_desc}"
 
     except Exception as e:
         yield f"### ❌ Error: {str(e)}"
 
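The color heuristic works on channel means only. A toy check (synthetic frame, not a photo):

```python
# Toy check of the mean-RGB heuristic used in identify_image.
import numpy as np
frame = np.zeros((64, 64, 3), dtype=np.uint8)
frame[:, :, 1] = 200  # saturate the green channel
r, g, b = (float(np.mean(frame[:, :, c])) for c in range(3))
print(g > r * 1.1 and g > b * 1.1)  # True -> "green" gets appended
```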
 # ================== GRADIO UI ==================
 
+with gr.Blocks(title="🐦 BirdSense Pro - Ollama LLM") as demo:
 
    gr.HTML("""
    <div style="text-align: center; background: linear-gradient(135deg, #1a4d2e 0%, #2d5a3e 50%, #1a4d2e 100%); padding: 2rem; border-radius: 16px; margin-bottom: 1.5rem;">
        <h1 style="color: #4ade80; font-size: 2.5rem; margin: 0;">🐦 BirdSense Pro</h1>
+       <p style="color: #94a3b8; font-size: 1.2rem;">Local LLM Bird Identification (Ollama)</p>
        <p style="color: #64748b; font-size: 0.9rem;">
+           🤖 Uses LOCAL Ollama LLM • 10,000+ species • Multi-bird detection
        </p>
    </div>
    """)
 
+    # LLM Status indicator
+    status_text = get_llm_status()
+    gr.Markdown(f"**Current LLM:** {status_text}")
+
    with gr.Tabs():
+        # AUDIO TAB
+        with gr.Tab("🎤 Audio"):
            gr.Markdown("""
+            ### Record or upload bird audio
+
+            The audio features are extracted and sent to the LLM (Ollama) which identifies ALL matching birds.
            """)
 
            with gr.Row():
                with gr.Column(scale=1):
+                    audio_in = gr.Audio(sources=["microphone", "upload"], type="numpy", label="🎤 Bird Audio")
                    with gr.Row():
+                        loc_in = gr.Textbox(label="📍 Location", placeholder="e.g., Western Ghats")
+                        month_in = gr.Dropdown(
+                            label="📅 Month",
+                            choices=["", "January", "February", "March", "April", "May",
+                                     "June", "July", "August", "September", "October",
+                                     "November", "December"]
                        )
+                    audio_btn = gr.Button("🔍 Identify with Ollama LLM", variant="primary", size="lg")
 
                with gr.Column(scale=2):
+                    audio_out = gr.Markdown()
 
+            audio_btn.click(identify_audio, [audio_in, loc_in, month_in], audio_out)
 
+        # DESCRIPTION TAB
+        with gr.Tab("📝 Description"):
            gr.Markdown("""
+            ### Describe the bird you saw or heard
+
+            The LLM will analyze your description and identify matching species.
            """)
 
            with gr.Row():
                with gr.Column(scale=1):
+                    desc_in = gr.Textbox(
+                        label="Bird Description",
+                        placeholder="Example: Small green bird with red forehead, making tuk-tuk-tuk sound like a hammer",
+                        lines=4
                    )
+                    desc_btn = gr.Button("🔍 Identify with Ollama LLM", variant="primary", size="lg")
 
                with gr.Column(scale=2):
+                    desc_out = gr.Markdown()
 
+            desc_btn.click(identify_description, [desc_in], desc_out)
 
+        # IMAGE TAB
+        with gr.Tab("📷 Image"):
            gr.Markdown("""
+            ### Upload or capture a bird image
+
+            Colors are extracted and sent to the LLM for identification.
            """)
 
            with gr.Row():
                with gr.Column(scale=1):
+                    img_in = gr.Image(sources=["upload", "webcam"], type="numpy", label="📷 Bird Image")
+                    img_btn = gr.Button("🔍 Identify with Ollama LLM", variant="primary", size="lg")
 
                with gr.Column(scale=2):
+                    img_out = gr.Markdown()
 
+            img_btn.click(identify_image, [img_in], img_out)
 
+        # SETUP TAB
+        with gr.Tab("⚙️ Setup"):
+            gr.Markdown(f"""
+            ## Ollama Setup
+
+            BirdSense Pro uses **Ollama** for local LLM inference.
+
+            ### Current Status: {get_llm_status()}
+
+            ### Setup Instructions:
+
+            1. **Install Ollama:**
+            ```bash
+            # macOS
+            brew install ollama
+
+            # Or download from https://ollama.ai
+            ```
+
+            2. **Start Ollama:**
+            ```bash
+            ollama serve
+            ```
+
+            3. **Pull the model:**
+            ```bash
+            ollama pull {OLLAMA_MODEL}
+            ```
+
+            4. **Refresh this page and try again!**
+
+            ### Model Used: `{OLLAMA_MODEL}`
+
+            This is a fast, efficient model good for bird identification.
+            For better accuracy, you can also try:
+            - `llama3.2:3b`
+            - `mistral:7b`
+            - `qwen2.5:7b`
+
+            Change the model in the code: `OLLAMA_MODEL = "your-model"`
            """)
 
    gr.HTML("""
    <div style="text-align: center; padding: 1rem; margin-top: 1rem; border-top: 1px solid #334155;">
        <p style="color: #4ade80; font-weight: bold;">🐦 BirdSense Pro - CSCR Initiative</p>
        <p style="color: #64748b;">
+           Powered by LOCAL Ollama LLM • <a href="https://github.com/sohamzycus/eagv2/tree/master/birdsense" style="color: #4ade80;">GitHub</a>
        </p>
    </div>
    """)
 
+
 if __name__ == "__main__":
+    print(f"\n🐦 BirdSense Pro")
+    print(f"LLM Status: {get_llm_status()}")
+    print(f"\nStarting server...")
    demo.launch(server_name="0.0.0.0", server_port=7860)