Spaces:

abdulshakur
/

YT-TranscriptSegmenter

Sleeping

App Files Files Community

abdulshakur committed on Mar 23, 2025

Commit

0ee929e

verified ·

1 Parent(s): cdc3c30

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -7

app.py CHANGED Viewed

@@ -3,11 +3,20 @@ import nltk
 import re
 from nltk.tokenize import sent_tokenize
-# Download necessary NLTK data
 try:
     nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt', quiet=True)
 def count_tokens(text):
     """
@@ -15,8 +24,13 @@ def count_tokens(text):
     This is a rough approximation based on counting words and punctuation.
     """
     # Split on whitespace and keep punctuation as tokens
-    words = re.findall(r'\b\w+\b|[.,!?;:]', text)
-    return len(words)
 def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=True):
     """
@@ -37,8 +51,17 @@ def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=Tru
     transcript = re.sub(r'\s+', ' ', transcript).strip()
     if smart_boundaries:
-        # Use sentence tokenization for smarter segmentation
-        sentences = sent_tokenize(transcript)
         segments = []
         current_segment = ""
         current_token_count = 0
@@ -68,6 +91,7 @@ def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=Tru
     numbered_segments = [f"Segment {i+1}/{len(segments)}:\n{segment}"
                          for i, segment in enumerate(segments)]
     return numbered_segments
 def process_transcript(transcript, max_length, use_smart_boundaries):

 import re
 from nltk.tokenize import sent_tokenize
+# Make sure NLTK data is downloaded correctly
+import os
+os.environ['NLTK_DATA'] = '/home/user/nltk_data'
+nltk.download('punkt', quiet=True, download_dir='/home/user/nltk_data')
+# Make sure the data is available
 try:
     nltk.data.find('tokenizers/punkt')
+    print("NLTK punkt tokenizer found successfully!")
+except LookupError as e:
+    print(f"Error finding punkt tokenizer: {e}")
+    # Try a more explicit download
+    nltk.download('punkt', download_dir='/home/user/nltk_data')
+    print("Attempted explicit download of punkt")
 def count_tokens(text):
     """
     This is a rough approximation based on counting words and punctuation.
     """
     # Split on whitespace and keep punctuation as tokens
+    try:
+        words = re.findall(r'\b\w+\b|[.,!?;:]', text)
+        return len(words)
+    except Exception as e:
+        print(f"Error counting tokens: {e}")
+        # Fallback to a simpler method
+        return len(text.split())
 def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=True):
     """
     transcript = re.sub(r'\s+', ' ', transcript).strip()
     if smart_boundaries:
+        try:
+            # Use sentence tokenization for smarter segmentation
+            sentences = sent_tokenize(transcript)
+            print(f"Successfully tokenized transcript into {len(sentences)} sentences")
+        except Exception as e:
+            print(f"Error during sentence tokenization: {e}")
+            print("Falling back to simple segmentation")
+            # Fall back to simple segmentation
+            smart_boundaries = False
+    if smart_boundaries:
         segments = []
         current_segment = ""
         current_token_count = 0
     numbered_segments = [f"Segment {i+1}/{len(segments)}:\n{segment}"
                          for i, segment in enumerate(segments)]
+    print(f"Created {len(numbered_segments)} segments")
     return numbered_segments
 def process_transcript(transcript, max_length, use_smart_boundaries):