"""Gradio app that segments long transcripts into LLM-sized chunks.

Splits at sentence boundaries (via NLTK's punkt tokenizer) when possible,
falling back to fixed-size character chunks when tokenization fails or is
disabled.
"""

import os
import re

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize

# Make sure NLTK data is downloaded correctly
os.environ['NLTK_DATA'] = '/home/user/nltk_data'
nltk.download('punkt', quiet=True, download_dir='/home/user/nltk_data')
# NLTK >= 3.8.2 moved sent_tokenize's data to 'punkt_tab'; download it too so
# smart segmentation works on modern installs. On older NLTK versions this is
# a harmless no-op (nltk.download returns False for unknown ids when quiet).
nltk.download('punkt_tab', quiet=True, download_dir='/home/user/nltk_data')

# Make sure the data is available
try:
    nltk.data.find('tokenizers/punkt')
    print("NLTK punkt tokenizer found successfully!")
except LookupError as e:
    print(f"Error finding punkt tokenizer: {e}")
    # Try a more explicit download
    nltk.download('punkt', download_dir='/home/user/nltk_data')
    print("Attempted explicit download of punkt")


def count_tokens(text):
    """
    Estimate the number of tokens in a text.

    This is a rough approximation based on counting words and punctuation.

    Args:
        text: The text to count tokens in.

    Returns:
        An integer estimate of the token count.
    """
    # Split on whitespace and keep punctuation as tokens
    try:
        words = re.findall(r'\b\w+\b|[.,!?;:]', text)
        return len(words)
    except Exception as e:
        # Defensive: re.findall should not raise for str input, but keep a
        # whitespace-split fallback rather than crashing the UI.
        print(f"Error counting tokens: {e}")
        # Fallback to a simpler method
        return len(text.split())


def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=True):
    """
    Segments a transcript into smaller chunks for processing.

    Args:
        transcript: The full transcript text
        max_segment_length: Maximum length of each segment in characters
        smart_boundaries: Whether to use sentence boundaries for smarter segmentation

    Returns:
        A list of segments, each prefixed with "Segment i/n:" for reference.
        Empty list for empty/whitespace-only input.
    """
    if not transcript or transcript.strip() == "":
        return []

    # Clean up the transcript by normalizing whitespace
    transcript = re.sub(r'\s+', ' ', transcript).strip()

    if smart_boundaries:
        try:
            # Use sentence tokenization for smarter segmentation
            sentences = sent_tokenize(transcript)
            print(f"Successfully tokenized transcript into {len(sentences)} sentences")
        except Exception as e:
            print(f"Error during sentence tokenization: {e}")
            print("Falling back to simple segmentation")
            # Fall back to simple segmentation
            smart_boundaries = False

    if smart_boundaries:
        segments = []
        current_segment = ""
        current_token_count = 0
        # NOTE(review): estimated *token* counts are compared against this
        # *character* budget below — tokens and characters are conflated, so
        # smart segments come out shorter than max_segment_length characters.
        # Preserved as-is; confirm whether that is intentional.
        estimated_token_limit = max_segment_length  # Characters as rough approximation

        for sentence in sentences:
            sentence_token_count = count_tokens(sentence)

            if current_token_count + sentence_token_count <= estimated_token_limit:
                current_segment += sentence + " "
                current_token_count += sentence_token_count
            else:
                # Current segment is full: flush it and start a new one with
                # this sentence (a single over-long sentence becomes its own
                # segment, possibly exceeding the limit).
                if current_segment:
                    segments.append(current_segment.strip())
                current_segment = sentence + " "
                current_token_count = sentence_token_count

        if current_segment:
            # Add the last segment if it exists
            segments.append(current_segment.strip())
    else:
        # Simple character-based chunking without respecting sentence boundaries
        segments = []
        for i in range(0, len(transcript), max_segment_length):
            segments.append(transcript[i:i + max_segment_length])

    # Add segment numbers for easy reference
    numbered_segments = [f"Segment {i+1}/{len(segments)}:\n{segment}"
                         for i, segment in enumerate(segments)]

    print(f"Created {len(numbered_segments)} segments")
    return numbered_segments


def process_transcript(transcript, max_length, use_smart_boundaries):
    """Main function that processes the transcript and returns results"""
    if not transcript or transcript.strip() == "":
        return "", "No segments created", {}

    segments = segment_transcript(transcript, max_length, use_smart_boundaries)

    # Create segment statistics. Note: average_segment_length is computed from
    # the raw transcript length, not from the prefixed segment strings.
    stats = {
        "total_segments": len(segments),
        "total_characters": len(transcript),
        "average_segment_length": len(transcript) / max(1, len(segments)),
        "segments": [
            {"id": i+1, "characters": len(segment), "estimated_tokens": count_tokens(segment)}
            for i, segment in enumerate(segments)
        ]
    }

    # Format the segments for display
    formatted_segments = "\n\n" + "\n\n".join(segments)

    return formatted_segments, f"{len(segments)} segments created", stats


# Create the Gradio interface
with gr.Blocks(title="Transcript Segmenter") as demo:
    gr.Markdown("# Transcript Segmenter")
    gr.Markdown("""
    This tool segments long transcripts into smaller chunks that can be processed by LLM-based tools.
    It intelligently splits at sentence boundaries to maintain context.
    """)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Full Transcript",
                placeholder="Paste your full transcript here...",
                lines=10
            )

            with gr.Row():
                segment_length = gr.Slider(
                    label="Maximum segment length (characters)",
                    minimum=500,
                    maximum=3000,
                    value=1500,
                    step=100
                )
                smart_boundaries = gr.Checkbox(
                    label="Use smart sentence boundaries",
                    value=True
                )

            segment_btn = gr.Button("Segment Transcript")

        with gr.Column():
            output_segments = gr.Textbox(
                label="Segmented Transcript",
                placeholder="Segmented transcript will appear here...",
                lines=15
            )
            segment_count = gr.Textbox(label="Number of Segments")
            segment_stats = gr.JSON(label="Segment Statistics")

    segment_btn.click(
        fn=process_transcript,
        inputs=[input_text, segment_length, smart_boundaries],
        outputs=[output_segments, segment_count, segment_stats]
    )

    # API documentation section
    gr.Markdown("""
    ## API Usage

    This tool can be called programmatically using the Gradio client:

    ```python
    from gradio_client import Client

    client = Client("https://your-space-name.hf.space")
    segments, count, stats = client.predict(
        "Your full transcript text here",  # Input transcript
        1500,                              # Max segment length
        True,                              # Use smart boundaries
        fn_index=0
    )
    ```

    ## Tips for Best Results

    - The tool works best when transcripts have proper punctuation
    - Using "smart sentence boundaries" preserves coherent segments
    - For transcripts with poor sentence structure, you may want to disable smart boundaries
    - A segment length of 1500-2000 characters works well for most LLM-based analyzers
    """)

# Launch the app - this is the standard way to launch Gradio apps
if __name__ == "__main__":
    demo.launch()