Abhishek279 committed on
Commit 0592e7f · verified · 1 Parent(s): f78efaa

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +24 -27
src/streamlit_app.py CHANGED
@@ -1104,11 +1104,12 @@ if language_option == "Specify Language":
     )
     selected_language = language_codes[selected_language_name]
 
-    # Translation option: always output English text (for Hindi and other languages)
+    # Translation option
+    st.sidebar.markdown("---")
     translate_to_english = st.sidebar.checkbox(
-        "Translate non-English speech to English text",
+        "🌍 Translate to English",
         value=True,
-        help="When enabled, Whisper will output English text even if the speakers are talking in Hindi or other languages."
+        help="Convert any language (Hindi, Spanish, etc.) to English text. Recommended for Hindi videos."
     )
 
     # Load models
@@ -1208,23 +1209,25 @@ if uploaded_file is not None:
         transcribe_options = {
             "verbose": False,  # Set to True for debugging
             "fp16": False,  # Use FP32 for CPU compatibility
-            "condition_on_previous_text": False,  # Disable to avoid KV cache tensor size issues
+            "condition_on_previous_text": True,  # Better context
             "compression_ratio_threshold": 2.4,  # Prevent early stopping
             "logprob_threshold": -1.0,  # Lower threshold for better detection
-            "no_speech_threshold": 0.3,  # Lower threshold to catch more speech (default is 0.6)
+            "no_speech_threshold": 0.3,  # Much lower threshold to catch more speech (default is 0.6)
             "temperature": 0.0,  # Deterministic output
             "best_of": 1,  # Use single best result
-            "beam_size": 1,  # Use greedy decoding (beam_size=1) to avoid KV cache tensor mismatch errors
+            "beam_size": 5,  # Beam search size for better accuracy
         }
-        # Language and translation handling
+
+        # Handle translation and language options
         if translate_to_english:
-            # Ask Whisper to translate any non-English speech into English text
+            # TRANSLATE mode: convert any language to English
             transcribe_options["task"] = "translate"
-            # If the user explicitly selected a source language (e.g. Hindi), pass it as the input language
-            if selected_language and selected_language != "en":
+            # Optionally specify source language for better accuracy
+            if selected_language:
                 transcribe_options["language"] = selected_language
+            # Note: Output will always be English regardless of input language
         else:
-            # Normal transcription in the spoken language
+            # TRANSCRIBE mode: output in the same language as input
             if selected_language:
                 transcribe_options["language"] = selected_language
 
@@ -1237,11 +1240,6 @@ if uploaded_file is not None:
         # Verify we got full transcription
         total_segments = len(result.get("segments", []))
         transcription_text = result.get("text", "").strip()
-
-        # If main text is empty but we have segments, reconstruct from segments
-        if not transcription_text and total_segments > 0:
-            transcription_text = " ".join([seg.get("text", "").strip() for seg in result.get("segments", []) if seg.get("text", "").strip()])
-
         transcription_length = len(transcription_text)
 
         if total_segments == 0:
@@ -1376,16 +1374,8 @@ if uploaded_file is not None:
         # Display results
         # Main transcription text
         st.subheader("📝 Transcription")
-
-        # Extract transcription text - try multiple methods
         transcription_text = result.get("text", "").strip()
 
-        # If main text is empty but we have segments, reconstruct from segments
-        if not transcription_text:
-            segments = result.get("segments", [])
-            if segments:
-                transcription_text = " ".join([seg.get("text", "").strip() for seg in segments if seg.get("text", "").strip()])
-
         # Show transcription statistics
         total_segments = len(result.get("segments", []))
         if total_segments > 0:
@@ -1788,10 +1778,17 @@ if uploaded_file is not None:
             "ur": "Urdu"
         }
         language_display = language_names.get(detected_language, detected_language.upper())
-        if selected_language:
-            st.info(f"Specified Language: {language_names.get(selected_language, selected_language.upper())} | Detected: {language_display}")
+
+        if translate_to_english:
+            if selected_language:
+                st.info(f"🌍 Source: {language_names.get(selected_language, selected_language.upper())} (Detected: {language_display}) → Translated to English ✓")
+            else:
+                st.info(f"🌍 Detected Language: {language_display} → Translated to English ✓")
         else:
-            st.info(f"Detected Language: {language_display}")
+            if selected_language:
+                st.info(f"Specified Language: {language_names.get(selected_language, selected_language.upper())} | Detected: {language_display}")
+            else:
+                st.info(f"Detected Language: {language_display}")
 
         if aligned_segments:
             unique_speakers = set(seg["speaker"] for seg in aligned_segments)
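
For reference, here is a minimal standalone sketch (not code from this commit) of how the updated transcribe_options are consumed by openai-whisper's model.transcribe(). The model size, the audio path, and the hard-coded translate_to_english / selected_language values are illustrative assumptions only:

# Sketch only: mirrors the options set in this commit, with placeholder inputs.
import whisper

model = whisper.load_model("base")  # assumed size; the app may load a different model

transcribe_options = {
    "verbose": False,
    "fp16": False,                        # FP32 for CPU-only environments
    "condition_on_previous_text": True,   # carry context across segments
    "compression_ratio_threshold": 2.4,
    "logprob_threshold": -1.0,
    "no_speech_threshold": 0.3,           # default is 0.6; lower keeps quieter speech
    "temperature": 0.0,
    "best_of": 1,                         # only applies when sampling (temperature > 0)
    "beam_size": 5,                       # beam search is used at temperature 0.0
}

translate_to_english = True   # mirrors the sidebar checkbox
selected_language = "hi"      # e.g. Hindi source audio (assumed)

if translate_to_english:
    transcribe_options["task"] = "translate"   # output text is English
if selected_language:
    transcribe_options["language"] = selected_language

result = model.transcribe("audio.wav", **transcribe_options)  # placeholder path
print(result["text"])

openai-whisper's transcribe() resolves the beam_size / best_of overlap itself: when decoding at temperature 0.0 it drops best_of and uses beam search, and best_of only applies when sampling at temperatures above zero, so passing both (as the app now does) should not conflict.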