Abhishek279 committed on
Commit 0592e7f · verified · 1 Parent(s): f78efaa

Update src/streamlit_app.py

Files changed (1)
  1. src/streamlit_app.py +24 -27
src/streamlit_app.py CHANGED
@@ -1104,11 +1104,12 @@ if language_option == "Specify Language":
     )
     selected_language = language_codes[selected_language_name]
 
-    # Translation option: always output English text (for Hindi and other languages)
+    # Translation option
+    st.sidebar.markdown("---")
     translate_to_english = st.sidebar.checkbox(
-        "Translate non-English speech to English text",
+        "🌍 Translate to English",
         value=True,
-        help="When enabled, Whisper will output English text even if the speakers are talking in Hindi or other languages."
+        help="Convert any language (Hindi, Spanish, etc.) to English text. Recommended for Hindi videos."
     )
 
     # Load models
@@ -1208,23 +1209,25 @@ if uploaded_file is not None:
         transcribe_options = {
             "verbose": False,  # Set to True for debugging
             "fp16": False,  # Use FP32 for CPU compatibility
-            "condition_on_previous_text": False,  # Disable to avoid KV cache tensor size issues
+            "condition_on_previous_text": True,  # Better context
             "compression_ratio_threshold": 2.4,  # Prevent early stopping
             "logprob_threshold": -1.0,  # Lower threshold for better detection
-            "no_speech_threshold": 0.3,  # Lower threshold to catch more speech (default is 0.6)
+            "no_speech_threshold": 0.3,  # Much lower threshold to catch more speech (default is 0.6)
             "temperature": 0.0,  # Deterministic output
             "best_of": 1,  # Use single best result
-            "beam_size": 1,  # Use greedy decoding (beam_size=1) to avoid KV cache tensor mismatch errors
+            "beam_size": 5,  # Beam search size for better accuracy
         }
-        # Language and translation handling
+
+        # Handle translation and language options
         if translate_to_english:
-            # Ask Whisper to translate any non-English speech into English text
+            # TRANSLATE mode: convert any language to English
             transcribe_options["task"] = "translate"
-            # If the user explicitly selected a source language (e.g. Hindi), pass it as the input language
-            if selected_language and selected_language != "en":
+            # Optionally specify source language for better accuracy
+            if selected_language:
                 transcribe_options["language"] = selected_language
+            # Note: Output will always be English regardless of input language
         else:
-            # Normal transcription in the spoken language
+            # TRANSCRIBE mode: output in the same language as input
             if selected_language:
                 transcribe_options["language"] = selected_language
 
@@ -1237,11 +1240,6 @@ if uploaded_file is not None:
         # Verify we got full transcription
         total_segments = len(result.get("segments", []))
         transcription_text = result.get("text", "").strip()
-
-        # If main text is empty but we have segments, reconstruct from segments
-        if not transcription_text and total_segments > 0:
-            transcription_text = " ".join([seg.get("text", "").strip() for seg in result.get("segments", []) if seg.get("text", "").strip()])
-
         transcription_length = len(transcription_text)
 
         if total_segments == 0:
@@ -1376,16 +1374,8 @@ if uploaded_file is not None:
         # Display results
         # Main transcription text
         st.subheader("📝 Transcription")
-
-        # Extract transcription text - try multiple methods
         transcription_text = result.get("text", "").strip()
 
-        # If main text is empty but we have segments, reconstruct from segments
-        if not transcription_text:
-            segments = result.get("segments", [])
-            if segments:
-                transcription_text = " ".join([seg.get("text", "").strip() for seg in segments if seg.get("text", "").strip()])
-
         # Show transcription statistics
         total_segments = len(result.get("segments", []))
         if total_segments > 0:
@@ -1788,10 +1778,17 @@ if uploaded_file is not None:
             "ur": "Urdu"
         }
         language_display = language_names.get(detected_language, detected_language.upper())
-        if selected_language:
-            st.info(f"Specified Language: {language_names.get(selected_language, selected_language.upper())} | Detected: {language_display}")
+
+        if translate_to_english:
+            if selected_language:
+                st.info(f"🌍 Source: {language_names.get(selected_language, selected_language.upper())} (Detected: {language_display}) → Translated to English ✓")
+            else:
+                st.info(f"🌍 Detected Language: {language_display} → Translated to English ✓")
         else:
-            st.info(f"Detected Language: {language_display}")
+            if selected_language:
+                st.info(f"Specified Language: {language_names.get(selected_language, selected_language.upper())} | Detected: {language_display}")
+            else:
+                st.info(f"Detected Language: {language_display}")
 
         if aligned_segments:
             unique_speakers = set(seg["speaker"] for seg in aligned_segments)
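
For reference, here is a minimal standalone sketch (not code from this commit) of how the updated transcribe_options are consumed by openai-whisper's model.transcribe(). The model size, the audio path, and the hard-coded translate_to_english / selected_language values are illustrative assumptions only:

# Sketch only: mirrors the options set in this commit, with placeholder inputs.
import whisper

model = whisper.load_model("base")  # assumed size; the app may load a different model

transcribe_options = {
    "verbose": False,
    "fp16": False,                        # FP32 for CPU-only environments
    "condition_on_previous_text": True,   # carry context across segments
    "compression_ratio_threshold": 2.4,
    "logprob_threshold": -1.0,
    "no_speech_threshold": 0.3,           # default is 0.6; lower keeps quieter speech
    "temperature": 0.0,
    "best_of": 1,                         # only applies when sampling (temperature > 0)
    "beam_size": 5,                       # beam search is used at temperature 0.0
}

translate_to_english = True   # mirrors the sidebar checkbox
selected_language = "hi"      # e.g. Hindi source audio (assumed)

if translate_to_english:
    transcribe_options["task"] = "translate"   # output text is English
if selected_language:
    transcribe_options["language"] = selected_language

result = model.transcribe("audio.wav", **transcribe_options)  # placeholder path
print(result["text"])

openai-whisper's transcribe() resolves the beam_size / best_of overlap itself: when decoding at temperature 0.0 it drops best_of and uses beam search, and best_of only applies when sampling at temperatures above zero, so passing both (as the app now does) should not conflict.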