"""Gradio app that segments long transcripts into LLM-sized chunks.

Splits at sentence boundaries (via NLTK's punkt tokenizer) when possible,
falling back to fixed-size character chunks when tokenization fails or is
disabled.
"""

import os
import re

import gradio as gr
import nltk
from nltk.tokenize import sent_tokenize

# Make sure NLTK data is downloaded correctly
os.environ['NLTK_DATA'] = '/home/user/nltk_data'
nltk.download('punkt', quiet=True, download_dir='/home/user/nltk_data')
# NLTK >= 3.8.2 moved sent_tokenize's data to 'punkt_tab'; download it too so
# smart segmentation works on modern installs. On older NLTK versions this is
# a harmless no-op (nltk.download returns False for unknown ids when quiet).
nltk.download('punkt_tab', quiet=True, download_dir='/home/user/nltk_data')

# Make sure the data is available
try:
    nltk.data.find('tokenizers/punkt')
    print("NLTK punkt tokenizer found successfully!")
except LookupError as e:
    print(f"Error finding punkt tokenizer: {e}")
    # Try a more explicit download
    nltk.download('punkt', download_dir='/home/user/nltk_data')
    print("Attempted explicit download of punkt")


def count_tokens(text):
    """
    Estimate the number of tokens in a text.

    This is a rough approximation based on counting words and punctuation.

    Args:
        text: The text to count tokens in.

    Returns:
        An integer estimate of the token count.
    """
    # Split on whitespace and keep punctuation as tokens
    try:
        words = re.findall(r'\b\w+\b|[.,!?;:]', text)
        return len(words)
    except Exception as e:
        # Defensive: re.findall should not raise for str input, but keep a
        # whitespace-split fallback rather than crashing the UI.
        print(f"Error counting tokens: {e}")
        # Fallback to a simpler method
        return len(text.split())


def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=True):
    """
    Segments a transcript into smaller chunks for processing.

    Args:
        transcript: The full transcript text
        max_segment_length: Maximum length of each segment in characters
        smart_boundaries: Whether to use sentence boundaries for smarter segmentation

    Returns:
        A list of segments, each prefixed with "Segment i/n:" for reference.
        Empty list for empty/whitespace-only input.
    """
    if not transcript or transcript.strip() == "":
        return []

    # Clean up the transcript by normalizing whitespace
    transcript = re.sub(r'\s+', ' ', transcript).strip()

    if smart_boundaries:
        try:
            # Use sentence tokenization for smarter segmentation
            sentences = sent_tokenize(transcript)
            print(f"Successfully tokenized transcript into {len(sentences)} sentences")
        except Exception as e:
            print(f"Error during sentence tokenization: {e}")
            print("Falling back to simple segmentation")
            # Fall back to simple segmentation
            smart_boundaries = False

    if smart_boundaries:
        segments = []
        current_segment = ""
        current_token_count = 0
        # NOTE(review): estimated *token* counts are compared against this
        # *character* budget below — tokens and characters are conflated, so
        # smart segments come out shorter than max_segment_length characters.
        # Preserved as-is; confirm whether that is intentional.
        estimated_token_limit = max_segment_length  # Characters as rough approximation

        for sentence in sentences:
            sentence_token_count = count_tokens(sentence)

            if current_token_count + sentence_token_count <= estimated_token_limit:
                current_segment += sentence + " "
                current_token_count += sentence_token_count
            else:
                # Current segment is full: flush it and start a new one with
                # this sentence (a single over-long sentence becomes its own
                # segment, possibly exceeding the limit).
                if current_segment:
                    segments.append(current_segment.strip())
                current_segment = sentence + " "
                current_token_count = sentence_token_count

        if current_segment:
            # Add the last segment if it exists
            segments.append(current_segment.strip())
    else:
        # Simple character-based chunking without respecting sentence boundaries
        segments = []
        for i in range(0, len(transcript), max_segment_length):
            segments.append(transcript[i:i + max_segment_length])

    # Add segment numbers for easy reference
    numbered_segments = [f"Segment {i+1}/{len(segments)}:\n{segment}"
                         for i, segment in enumerate(segments)]

    print(f"Created {len(numbered_segments)} segments")
    return numbered_segments


def process_transcript(transcript, max_length, use_smart_boundaries):
    """Main function that processes the transcript and returns results"""
    if not transcript or transcript.strip() == "":
        return "", "No segments created", {}

    segments = segment_transcript(transcript, max_length, use_smart_boundaries)

    # Create segment statistics. Note: average_segment_length is computed from
    # the raw transcript length, not from the prefixed segment strings.
    stats = {
        "total_segments": len(segments),
        "total_characters": len(transcript),
        "average_segment_length": len(transcript) / max(1, len(segments)),
        "segments": [
            {"id": i+1, "characters": len(segment), "estimated_tokens": count_tokens(segment)}
            for i, segment in enumerate(segments)
        ]
    }

    # Format the segments for display
    formatted_segments = "\n\n" + "\n\n".join(segments)

    return formatted_segments, f"{len(segments)} segments created", stats


# Create the Gradio interface
with gr.Blocks(title="Transcript Segmenter") as demo:
    gr.Markdown("# Transcript Segmenter")
    gr.Markdown("""
    This tool segments long transcripts into smaller chunks that can be processed by LLM-based tools.
    It intelligently splits at sentence boundaries to maintain context.
    """)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Full Transcript",
                placeholder="Paste your full transcript here...",
                lines=10
            )

            with gr.Row():
                segment_length = gr.Slider(
                    label="Maximum segment length (characters)",
                    minimum=500,
                    maximum=3000,
                    value=1500,
                    step=100
                )
                smart_boundaries = gr.Checkbox(
                    label="Use smart sentence boundaries",
                    value=True
                )

            segment_btn = gr.Button("Segment Transcript")

        with gr.Column():
            output_segments = gr.Textbox(
                label="Segmented Transcript",
                placeholder="Segmented transcript will appear here...",
                lines=15
            )
            segment_count = gr.Textbox(label="Number of Segments")
            segment_stats = gr.JSON(label="Segment Statistics")

    segment_btn.click(
        fn=process_transcript,
        inputs=[input_text, segment_length, smart_boundaries],
        outputs=[output_segments, segment_count, segment_stats]
    )

    # API documentation section
    gr.Markdown("""
    ## API Usage

    This tool can be called programmatically using the Gradio client:

    ```python
    from gradio_client import Client

    client = Client("https://your-space-name.hf.space")
    segments, count, stats = client.predict(
        "Your full transcript text here",  # Input transcript
        1500,                              # Max segment length
        True,                              # Use smart boundaries
        fn_index=0
    )
    ```

    ## Tips for Best Results

    - The tool works best when transcripts have proper punctuation
    - Using "smart sentence boundaries" preserves coherent segments
    - For transcripts with poor sentence structure, you may want to disable smart boundaries
    - A segment length of 1500-2000 characters works well for most LLM-based analyzers
    """)

# Launch the app - this is the standard way to launch Gradio apps
if __name__ == "__main__":
    demo.launch()