Spaces:
Sleeping
Sleeping
Update app.py
#1
by
nonnan - opened
app.py
CHANGED
|
@@ -155,13 +155,76 @@ class HistoryManager:
|
|
| 155 |
|
| 156 |
# Core Analysis Engine
|
| 157 |
class SentimentEngine:
|
| 158 |
-
"""Streamlined sentiment analysis"""
|
| 159 |
def __init__(self):
|
| 160 |
self.model_manager = ModelManager()
|
| 161 |
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
def analyze_single(self, text: str) -> Dict:
|
| 164 |
-
"""Analyze single text"""
|
| 165 |
if not text.strip():
|
| 166 |
raise ValueError("Empty text")
|
| 167 |
|
|
@@ -175,16 +238,21 @@ class SentimentEngine:
|
|
| 175 |
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
|
| 176 |
|
| 177 |
sentiment = "Positive" if probs[1] > probs[0] else "Negative"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
return {
|
| 179 |
'sentiment': sentiment,
|
| 180 |
'confidence': float(probs.max()),
|
| 181 |
'pos_prob': float(probs[1]),
|
| 182 |
-
'neg_prob': float(probs[0])
|
|
|
|
| 183 |
}
|
| 184 |
|
| 185 |
@handle_errors(default_return=[])
|
| 186 |
def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
|
| 187 |
-
"""Optimized batch processing"""
|
| 188 |
if len(texts) > config.BATCH_SIZE_LIMIT:
|
| 189 |
texts = texts[:config.BATCH_SIZE_LIMIT]
|
| 190 |
|
|
@@ -208,13 +276,17 @@ class SentimentEngine:
|
|
| 208 |
|
| 209 |
for text, prob in zip(batch, probs):
|
| 210 |
sentiment = "Positive" if prob[1] > prob[0] else "Negative"
|
|
|
|
|
|
|
|
|
|
| 211 |
results.append({
|
| 212 |
'text': text[:50] + '...' if len(text) > 50 else text,
|
| 213 |
'full_text': text,
|
| 214 |
'sentiment': sentiment,
|
| 215 |
'confidence': float(prob.max()),
|
| 216 |
'pos_prob': float(prob[1]),
|
| 217 |
-
'neg_prob': float(prob[0])
|
|
|
|
| 218 |
})
|
| 219 |
|
| 220 |
return results
|
|
@@ -275,6 +347,40 @@ class PlotFactory:
|
|
| 275 |
fig.tight_layout()
|
| 276 |
return fig
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
@staticmethod
|
| 279 |
@handle_errors(default_return=None)
|
| 280 |
def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
|
|
@@ -346,7 +452,7 @@ class DataHandler:
|
|
| 346 |
|
| 347 |
if format_type == 'csv':
|
| 348 |
writer = csv.writer(temp_file)
|
| 349 |
-
writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob'])
|
| 350 |
for entry in data:
|
| 351 |
writer.writerow([
|
| 352 |
entry.get('timestamp', ''),
|
|
@@ -354,7 +460,8 @@ class DataHandler:
|
|
| 354 |
entry.get('sentiment', ''),
|
| 355 |
f"{entry.get('confidence', 0):.4f}",
|
| 356 |
f"{entry.get('pos_prob', 0):.4f}",
|
| 357 |
-
f"{entry.get('neg_prob', 0):.4f}"
|
|
|
|
| 358 |
])
|
| 359 |
elif format_type == 'json':
|
| 360 |
json.dump(data, temp_file, indent=2, ensure_ascii=False)
|
|
@@ -394,18 +501,18 @@ class SentimentApp:
|
|
| 394 |
|
| 395 |
# Example data
|
| 396 |
self.examples = [
|
| 397 |
-
["
|
| 398 |
-
["
|
| 399 |
-
["
|
| 400 |
-
["
|
| 401 |
-
["
|
| 402 |
]
|
| 403 |
|
| 404 |
-
@handle_errors(default_return=("Please enter text", None, None, None))
|
| 405 |
def analyze_single(self, text: str, theme: str = 'default'):
|
| 406 |
-
"""Single text analysis"""
|
| 407 |
if not text.strip():
|
| 408 |
-
return "Please enter text", None, None, None
|
| 409 |
|
| 410 |
result = self.engine.analyze_single(text)
|
| 411 |
|
|
@@ -423,9 +530,14 @@ class SentimentApp:
|
|
| 423 |
prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
|
| 424 |
gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
|
| 425 |
cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
|
|
|
|
| 426 |
|
| 427 |
-
|
| 428 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 429 |
|
| 430 |
@handle_errors(default_return=None)
|
| 431 |
def analyze_batch(self, reviews: str, progress=None):
|
|
@@ -492,7 +604,7 @@ def create_interface():
|
|
| 492 |
|
| 493 |
with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
|
| 494 |
gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
|
| 495 |
-
gr.Markdown("Optimized sentiment analysis with advanced visualizations")
|
| 496 |
|
| 497 |
with gr.Tab("Single Analysis"):
|
| 498 |
with gr.Row():
|
|
@@ -516,13 +628,15 @@ def create_interface():
|
|
| 516 |
)
|
| 517 |
|
| 518 |
with gr.Column():
|
| 519 |
-
result_output = gr.Textbox(label="Result", lines=
|
| 520 |
|
| 521 |
with gr.Row():
|
| 522 |
prob_plot = gr.Plot(label="Probabilities")
|
| 523 |
gauge_plot = gr.Plot(label="Confidence")
|
| 524 |
|
| 525 |
-
|
|
|
|
|
|
|
| 526 |
|
| 527 |
with gr.Tab("Batch Analysis"):
|
| 528 |
with gr.Row():
|
|
@@ -558,7 +672,7 @@ def create_interface():
|
|
| 558 |
analyze_btn.click(
|
| 559 |
app.analyze_single,
|
| 560 |
inputs=[text_input, theme_selector],
|
| 561 |
-
outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot]
|
| 562 |
)
|
| 563 |
|
| 564 |
load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
|
|
|
|
| 155 |
|
| 156 |
# Core Analysis Engine
|
| 157 |
class SentimentEngine:
|
| 158 |
+
"""Streamlined sentiment analysis with attention-based keyword extraction"""
|
| 159 |
def __init__(self):
|
| 160 |
self.model_manager = ModelManager()
|
| 161 |
|
| 162 |
+
def extract_key_words(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Rank the words of ``text`` by last-layer attention from the first token.

    The text is tokenized and run through the model with
    ``output_attentions=True``. The last layer's attention is averaged over
    heads, and the row belonging to position 0 is used as a per-token score.
    ``##`` continuation pieces are glued back onto the preceding token and the
    merged word takes the highest score among its pieces.

    NOTE(review): the ``##`` prefix and the literal ``[CLS]``/``[SEP]``/``[PAD]``
    markers assume a BERT-style WordPiece tokenizer -- confirm against the
    model actually loaded by ``ModelManager``.

    Args:
        text: Raw input text; it is truncated to ``config.MAX_TEXT_LENGTH``
            tokens by the tokenizer.
        top_k: Maximum number of words to return. Values below zero are
            treated as zero.

    Returns:
        ``(word, score)`` pairs sorted by descending score. Words are
        lower-cased, at least ``config.MIN_WORD_LENGTH`` characters long and
        not in ``config.STOP_WORDS``. Scores are plain Python ``float``s.
        An empty list is returned when anything fails (the error is logged).
    """
    try:
        tokenizer = self.model_manager.tokenizer
        inputs = tokenizer(
            text, return_tensors="pt", padding=True,
            truncation=True, max_length=config.MAX_TEXT_LENGTH
        ).to(self.model_manager.device)

        # Inference only: no gradients needed for reading attention weights.
        with torch.no_grad():
            outputs = self.model_manager.model(**inputs, output_attentions=True)

        # outputs.attentions holds one tensor per layer; take the last layer
        # and average over the head dimension (dim=1).
        head_averaged = outputs.attentions[-1].mean(dim=1)

        # First batch item, attention row of position 0 over every token.
        first_token_attention = head_averaged[0, 0, :]

        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        # tolist() yields built-in floats rather than numpy.float32, keeping
        # the scores consistent with the float(...) fields used elsewhere and
        # safe to hand to json/csv writers.
        attention_scores = first_token_attention.cpu().tolist()

        word_scores = {}

        def remember(word: str, score: float) -> None:
            # Keep only words that are long enough and not stop words; when a
            # word occurs several times, keep its strongest score.
            if not word or len(word) < config.MIN_WORD_LENGTH:
                return
            key = word.lower()
            if key in config.STOP_WORDS or len(key) < config.MIN_WORD_LENGTH:
                return
            word_scores[key] = max(score, word_scores.get(key, score))

        current_word = ""
        current_score = 0.0

        for token, score in zip(tokens, attention_scores):
            if token in ('[CLS]', '[SEP]', '[PAD]'):
                continue

            if token.startswith('##'):
                # Continuation piece: extend the word, keep the max score.
                current_word += token[2:]
                current_score = max(current_score, score)
            else:
                # A new word starts here; store the one just finished.
                remember(current_word, current_score)
                current_word = token
                current_score = score

        # The final word is never followed by a "new word" token.
        remember(current_word, current_score)

        ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
        return ranked[:max(top_k, 0)]

    except Exception as e:
        # Best-effort feature: a failure here must not break the analysis.
        logger.error(f"Key word extraction failed: {e}")
        return []
|
| 224 |
+
|
| 225 |
+
@handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0, 'key_words': []})
|
| 226 |
def analyze_single(self, text: str) -> Dict:
|
| 227 |
+
"""Analyze single text with key word extraction"""
|
| 228 |
if not text.strip():
|
| 229 |
raise ValueError("Empty text")
|
| 230 |
|
|
|
|
| 238 |
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
|
| 239 |
|
| 240 |
sentiment = "Positive" if probs[1] > probs[0] else "Negative"
|
| 241 |
+
|
| 242 |
+
# Extract key contributing words
|
| 243 |
+
key_words = self.extract_key_words(text)
|
| 244 |
+
|
| 245 |
return {
|
| 246 |
'sentiment': sentiment,
|
| 247 |
'confidence': float(probs.max()),
|
| 248 |
'pos_prob': float(probs[1]),
|
| 249 |
+
'neg_prob': float(probs[0]),
|
| 250 |
+
'key_words': key_words
|
| 251 |
}
|
| 252 |
|
| 253 |
@handle_errors(default_return=[])
|
| 254 |
def analyze_batch(self, texts: List[str], progress_callback=None) -> List[Dict]:
|
| 255 |
+
"""Optimized batch processing with key words"""
|
| 256 |
if len(texts) > config.BATCH_SIZE_LIMIT:
|
| 257 |
texts = texts[:config.BATCH_SIZE_LIMIT]
|
| 258 |
|
|
|
|
| 276 |
|
| 277 |
for text, prob in zip(batch, probs):
|
| 278 |
sentiment = "Positive" if prob[1] > prob[0] else "Negative"
|
| 279 |
+
# Extract key words for each text in batch
|
| 280 |
+
key_words = self.extract_key_words(text, top_k=5) # Fewer for batch processing
|
| 281 |
+
|
| 282 |
results.append({
|
| 283 |
'text': text[:50] + '...' if len(text) > 50 else text,
|
| 284 |
'full_text': text,
|
| 285 |
'sentiment': sentiment,
|
| 286 |
'confidence': float(prob.max()),
|
| 287 |
'pos_prob': float(prob[1]),
|
| 288 |
+
'neg_prob': float(prob[0]),
|
| 289 |
+
'key_words': key_words
|
| 290 |
})
|
| 291 |
|
| 292 |
return results
|
|
|
|
| 347 |
fig.tight_layout()
|
| 348 |
return fig
|
| 349 |
|
| 350 |
+
@staticmethod
@handle_errors(default_return=None)
def create_keyword_chart(key_words: List[Tuple[str, float]], sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
    """Draw the key words as horizontal bars, one bar per ``(word, score)`` pair.

    Args:
        key_words: ``(word, score)`` pairs in the order they should appear,
            first pair at the top of the chart.
        sentiment: Label shown in the title; ``'Positive'`` selects
            ``theme.colors['pos']``, anything else ``theme.colors['neg']``.
        theme: Theme object providing the ``colors`` mapping.

    Returns:
        The figure, or ``None`` when ``key_words`` is empty.
    """
    if not key_words:
        return None

    with managed_figure(figsize=config.FIGURE_SIZE_SINGLE) as fig:
        axes = fig.add_subplot(111)

        # Split the pairs into parallel label / weight lists.
        labels = []
        weights = []
        for label, weight in key_words:
            labels.append(label)
            weights.append(weight)
        positions = range(len(labels))

        # Bar colour follows the predicted sentiment.
        if sentiment == 'Positive':
            bar_color = theme.colors['pos']
        else:
            bar_color = theme.colors['neg']

        rects = axes.barh(positions, weights, color=bar_color, alpha=0.7)
        axes.set_yticks(positions)
        axes.set_yticklabels(labels)
        axes.set_xlabel('Attention Weight')
        axes.set_title(f'Top Contributing Words ({sentiment})', fontweight='bold')

        # Print each score just to the right of its bar, vertically centred.
        for rect, weight in zip(rects, weights):
            label_x = rect.get_width() + 0.001
            label_y = rect.get_y() + rect.get_height()/2.
            axes.text(label_x, label_y, f'{weight:.3f}',
                      ha='left', va='center', fontsize=9)

        # Flip the y-axis so the first (highest scoring) word is on top.
        axes.invert_yaxis()
        axes.grid(axis='x', alpha=0.3)
        fig.tight_layout()
        return fig
|
| 383 |
+
|
| 384 |
@staticmethod
|
| 385 |
@handle_errors(default_return=None)
|
| 386 |
def create_wordcloud(text: str, sentiment: str, theme: ThemeContext) -> Optional[plt.Figure]:
|
|
|
|
| 452 |
|
| 453 |
if format_type == 'csv':
|
| 454 |
writer = csv.writer(temp_file)
|
| 455 |
+
writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Pos_Prob', 'Neg_Prob', 'Key_Words'])
|
| 456 |
for entry in data:
|
| 457 |
writer.writerow([
|
| 458 |
entry.get('timestamp', ''),
|
|
|
|
| 460 |
entry.get('sentiment', ''),
|
| 461 |
f"{entry.get('confidence', 0):.4f}",
|
| 462 |
f"{entry.get('pos_prob', 0):.4f}",
|
| 463 |
+
f"{entry.get('neg_prob', 0):.4f}",
|
| 464 |
+
"|".join([f"{word}:{score:.3f}" for word, score in entry.get('key_words', [])])
|
| 465 |
])
|
| 466 |
elif format_type == 'json':
|
| 467 |
json.dump(data, temp_file, indent=2, ensure_ascii=False)
|
|
|
|
| 501 |
|
| 502 |
# Example data
|
| 503 |
self.examples = [
|
| 504 |
+
["The cinematography was stunning but the plot was predictable and lacked depth."],
|
| 505 |
+
["A masterpiece! Powerful performances and unforgettable scenes throughout."],
|
| 506 |
+
["Boring from start to finish with terrible acting and weak plot development."],
|
| 507 |
+
["Impressive effects but the story was confusing and difficult to follow."],
|
| 508 |
+
["Absolutely incredible ending - one of the best films in recent years!"]
|
| 509 |
]
|
| 510 |
|
| 511 |
+
@handle_errors(default_return=("Please enter text", None, None, None, None))
|
| 512 |
def analyze_single(self, text: str, theme: str = 'default'):
|
| 513 |
+
"""Single text analysis with key words"""
|
| 514 |
if not text.strip():
|
| 515 |
+
return "Please enter text", None, None, None, None
|
| 516 |
|
| 517 |
result = self.engine.analyze_single(text)
|
| 518 |
|
|
|
|
| 530 |
prob_plot = PlotFactory.create_sentiment_bars(probs, theme_ctx)
|
| 531 |
gauge_plot = PlotFactory.create_confidence_gauge(result['confidence'], result['sentiment'], theme_ctx)
|
| 532 |
cloud_plot = PlotFactory.create_wordcloud(text, result['sentiment'], theme_ctx)
|
| 533 |
+
keyword_plot = PlotFactory.create_keyword_chart(result['key_words'], result['sentiment'], theme_ctx)
|
| 534 |
|
| 535 |
+
# Format result text with key words
|
| 536 |
+
key_words_str = ", ".join([f"{word}({score:.3f})" for word, score in result['key_words'][:5]])
|
| 537 |
+
result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
|
| 538 |
+
f"Key Words: {key_words_str}")
|
| 539 |
+
|
| 540 |
+
return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot
|
| 541 |
|
| 542 |
@handle_errors(default_return=None)
|
| 543 |
def analyze_batch(self, reviews: str, progress=None):
|
|
|
|
| 604 |
|
| 605 |
with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
|
| 606 |
gr.Markdown("# 🎬 AI Movie Sentiment Analyzer")
|
| 607 |
+
gr.Markdown("Optimized sentiment analysis with advanced visualizations and key word extraction")
|
| 608 |
|
| 609 |
with gr.Tab("Single Analysis"):
|
| 610 |
with gr.Row():
|
|
|
|
| 628 |
)
|
| 629 |
|
| 630 |
with gr.Column():
|
| 631 |
+
result_output = gr.Textbox(label="Result", lines=3)
|
| 632 |
|
| 633 |
with gr.Row():
|
| 634 |
prob_plot = gr.Plot(label="Probabilities")
|
| 635 |
gauge_plot = gr.Plot(label="Confidence")
|
| 636 |
|
| 637 |
+
with gr.Row():
|
| 638 |
+
wordcloud_plot = gr.Plot(label="Word Cloud")
|
| 639 |
+
keyword_plot = gr.Plot(label="Key Contributing Words")
|
| 640 |
|
| 641 |
with gr.Tab("Batch Analysis"):
|
| 642 |
with gr.Row():
|
|
|
|
| 672 |
analyze_btn.click(
|
| 673 |
app.analyze_single,
|
| 674 |
inputs=[text_input, theme_selector],
|
| 675 |
+
outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
|
| 676 |
)
|
| 677 |
|
| 678 |
load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
|