abdulshakur committed
Commit 0ee929e · verified · 1 Parent(s): cdc3c30

Update app.py

Files changed (1)
  1. app.py +31 -7
app.py CHANGED
@@ -3,11 +3,20 @@ import nltk
 import re
 from nltk.tokenize import sent_tokenize
 
-# Download necessary NLTK data
+# Make sure NLTK data is downloaded correctly
+import os
+os.environ['NLTK_DATA'] = '/home/user/nltk_data'
+nltk.download('punkt', quiet=True, download_dir='/home/user/nltk_data')
+
+# Make sure the data is available
 try:
     nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt', quiet=True)
+    print("NLTK punkt tokenizer found successfully!")
+except LookupError as e:
+    print(f"Error finding punkt tokenizer: {e}")
+    # Try a more explicit download
+    nltk.download('punkt', download_dir='/home/user/nltk_data')
+    print("Attempted explicit download of punkt")
 
 def count_tokens(text):
     """
@@ -15,8 +24,13 @@ def count_tokens(text):
     This is a rough approximation based on counting words and punctuation.
     """
     # Split on whitespace and keep punctuation as tokens
-    words = re.findall(r'\b\w+\b|[.,!?;:]', text)
-    return len(words)
+    try:
+        words = re.findall(r'\b\w+\b|[.,!?;:]', text)
+        return len(words)
+    except Exception as e:
+        print(f"Error counting tokens: {e}")
+        # Fallback to a simpler method
+        return len(text.split())
 
 def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=True):
     """
@@ -37,8 +51,17 @@ def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=Tru
     transcript = re.sub(r'\s+', ' ', transcript).strip()
 
     if smart_boundaries:
-        # Use sentence tokenization for smarter segmentation
-        sentences = sent_tokenize(transcript)
+        try:
+            # Use sentence tokenization for smarter segmentation
+            sentences = sent_tokenize(transcript)
+            print(f"Successfully tokenized transcript into {len(sentences)} sentences")
+        except Exception as e:
+            print(f"Error during sentence tokenization: {e}")
+            print("Falling back to simple segmentation")
+            # Fall back to simple segmentation
+            smart_boundaries = False
+
+    if smart_boundaries:
         segments = []
         current_segment = ""
         current_token_count = 0
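
The loop that actually fills segments sits outside this hunk's context. A hypothetical sketch of how sentence packing typically proceeds with these three variables, appending sentences until the token budget would be exceeded (illustrative only, not the commit's actual loop):

def pack_sentences(sentences, max_segment_length, count_tokens):
    # Illustrative packing loop; the real loop body is outside the diff context.
    segments = []
    current_segment = ""
    current_token_count = 0
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)
        # Close the current segment before it would exceed the budget.
        if current_segment and current_token_count + sentence_tokens > max_segment_length:
            segments.append(current_segment.strip())
            current_segment = ""
            current_token_count = 0
        current_segment += " " + sentence
        current_token_count += sentence_tokens
    if current_segment.strip():
        segments.append(current_segment.strip())
    return segments
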
@@ -68,6 +91,7 @@ def segment_transcript(transcript, max_segment_length=1500, smart_boundaries=Tru
     numbered_segments = [f"Segment {i+1}/{len(segments)}:\n{segment}"
                          for i, segment in enumerate(segments)]
 
+    print(f"Created {len(numbered_segments)} segments")
     return numbered_segments
 
 def process_transcript(transcript, max_length, use_smart_boundaries):
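
The f-string labels each chunk with its position and the total count. A quick check with dummy segments:

segments = ["first chunk of text", "second chunk of text"]
numbered = [f"Segment {i+1}/{len(segments)}:\n{segment}"
            for i, segment in enumerate(segments)]
print(numbered[0])
# Segment 1/2:
# first chunk of text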
 