Alamgirapi committed on
Commit
1ea5800
·
verified ·
1 Parent(s): f0c3f60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +230 -456
app.py CHANGED
@@ -8,75 +8,14 @@ from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorizati
8
  from NoCodeTextClassifier.models import Models
9
  import os
10
  import pickle
11
- import hashlib
12
- import hmac
13
  from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
14
 
15
- # Authentication Configuration
16
- USERS = {
17
- "admin": "admin123",
18
- "user1": "password123",
19
- "demo": "demo123"
20
- }
21
-
22
- def check_password():
23
- """Returns True if the user has correct password."""
24
- def password_entered():
25
- """Checks whether a password entered by the user is correct."""
26
- username = st.session_state["username"]
27
- password = st.session_state["password"]
28
-
29
- if username in USERS and hmac.compare_digest(USERS[username], password):
30
- st.session_state["password_correct"] = True
31
- st.session_state["authenticated_user"] = username
32
- del st.session_state["password"] # Don't store passwords
33
- else:
34
- st.session_state["password_correct"] = False
35
-
36
- # Return True if password is validated
37
- if st.session_state.get("password_correct", False):
38
- return True
39
-
40
- # Show login form
41
- st.markdown("## 🔐 Login Required")
42
- st.markdown("Please enter your credentials to access the Text Classification App")
43
-
44
- col1, col2, col3 = st.columns([1, 2, 1])
45
- with col2:
46
- st.text_input("Username", key="username", placeholder="Enter username")
47
- st.text_input("Password", type="password", key="password", placeholder="Enter password")
48
-
49
- if st.button("Login", use_container_width=True):
50
- password_entered()
51
-
52
- # Show demo credentials
53
- with st.expander("Demo Credentials"):
54
- st.info("""
55
- **Demo Account:**
56
- - Username: `demo`
57
- - Password: `demo123`
58
-
59
- **Admin Account:**
60
- - Username: `admin`
61
- - Password: `admin123`
62
- """)
63
-
64
- if st.session_state.get("password_correct", False) == False:
65
- st.error("😞 Username or password incorrect")
66
-
67
- return False
68
-
69
  # Utility functions
70
  def save_artifacts(obj, folder_name, file_name):
71
  """Save artifacts like encoders and vectorizers"""
72
- try:
73
- os.makedirs(folder_name, exist_ok=True)
74
- with open(os.path.join(folder_name, file_name), 'wb') as f:
75
- pickle.dump(obj, f)
76
- return True
77
- except Exception as e:
78
- st.error(f"Error saving {file_name}: {str(e)}")
79
- return False
80
 
81
  def load_artifacts(folder_name, file_name):
82
  """Load saved artifacts"""
@@ -84,10 +23,7 @@ def load_artifacts(folder_name, file_name):
84
  with open(os.path.join(folder_name, file_name), 'rb') as f:
85
  return pickle.load(f)
86
  except FileNotFoundError:
87
- st.warning(f"File {file_name} not found in {folder_name} folder")
88
- return None
89
- except Exception as e:
90
- st.error(f"Error loading {file_name}: {str(e)}")
91
  return None
92
 
93
  def load_model(model_name):
@@ -98,32 +34,6 @@ def load_model(model_name):
98
  except FileNotFoundError:
99
  st.error(f"Model {model_name} not found. Please train a model first.")
100
  return None
101
- except Exception as e:
102
- st.error(f"Error loading model: {str(e)}")
103
- return None
104
-
105
- def safe_file_upload(uploaded_file, encoding='utf-8'):
106
- """Safely read uploaded file with multiple encoding attempts"""
107
- if uploaded_file is None:
108
- return None
109
-
110
- encodings_to_try = [encoding, 'latin1', 'cp1252', 'iso-8859-1']
111
-
112
- for enc in encodings_to_try:
113
- try:
114
- # Reset file pointer
115
- uploaded_file.seek(0)
116
- df = pd.read_csv(uploaded_file, encoding=enc)
117
- st.success(f"File loaded successfully with {enc} encoding")
118
- return df
119
- except UnicodeDecodeError:
120
- continue
121
- except Exception as e:
122
- st.error(f"Error reading file with {enc}: {str(e)}")
123
- continue
124
-
125
- st.error("Could not read file with any common encoding. Please check your file format.")
126
- return None
127
 
128
  def predict_text(model_name, text, vectorizer_type="tfidf"):
129
  """Make prediction on new text"""
@@ -171,392 +81,256 @@ def predict_text(model_name, text, vectorizer_type="tfidf"):
171
  st.error(f"Error during prediction: {str(e)}")
172
  return None, None
173
 
174
- # Main App Logic
175
- def main_app():
176
- # Header with user info
177
- col1, col2 = st.columns([3, 1])
178
- with col1:
179
- st.title('🤖 No Code Text Classification App')
180
- st.write('Understand the behavior of your text data and train a model to classify the text data')
181
- with col2:
182
- st.markdown(f"**👤 User:** {st.session_state.get('authenticated_user', 'Unknown')}")
183
- if st.button("Logout", type="secondary"):
184
- for key in list(st.session_state.keys()):
185
- del st.session_state[key]
186
- st.rerun()
187
 
188
- # Sidebar
189
- section = st.sidebar.radio("Choose Section", ["📊 Data Analysis", "🚀 Train Model", "🔮 Predictions"])
190
 
191
- # Upload Data with improved error handling
192
- st.sidebar.subheader("📁 Upload Your Dataset")
193
-
194
- # File encoding selection
195
- encoding_choice = st.sidebar.selectbox(
196
- "File Encoding",
197
- ["utf-8", "latin1", "cp1252", "iso-8859-1"],
198
- help="If file upload fails, try different encodings"
199
- )
200
-
201
- train_data = st.sidebar.file_uploader(
202
- "Upload training data",
203
- type=["csv"],
204
- help="Upload a CSV file with your training data"
205
- )
206
-
207
- test_data = st.sidebar.file_uploader(
208
- "Upload test data (optional)",
209
- type=["csv"],
210
- help="Optional: Upload separate test data"
211
- )
212
 
213
- # Global variables to store data and settings
214
- if 'vectorizer_type' not in st.session_state:
215
- st.session_state.vectorizer_type = "tfidf"
 
 
 
 
 
 
 
 
 
 
 
 
216
 
217
- train_df = None
218
- test_df = None
219
- info = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
- if train_data is not None:
222
- with st.spinner("Loading training data..."):
223
- train_df = safe_file_upload(train_data, encoding_choice)
 
 
224
 
225
- if train_df is not None:
226
- try:
227
- if test_data is not None:
228
- test_df = safe_file_upload(test_data, encoding_choice)
229
-
230
- st.sidebar.success(f"✅ Training data loaded: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
231
- st.write("📋 Training Data Preview:")
232
- st.dataframe(train_df.head(3), use_container_width=True)
233
-
234
- columns = train_df.columns.tolist()
235
- text_data = st.sidebar.selectbox("📝 Choose the text column:", columns)
236
- target = st.sidebar.selectbox("🎯 Choose the target column:", columns)
237
 
238
- # Process data
239
- if text_data and target and text_data != target:
240
- with st.spinner("Processing data..."):
241
- info = Informations(train_df, text_data, target)
242
- train_df['clean_text'] = info.clean_text()
243
- train_df['text_length'] = info.text_length()
244
-
245
- # Handle label encoding manually if the class doesn't store encoder
246
- from sklearn.preprocessing import LabelEncoder
247
- label_encoder = LabelEncoder()
248
- train_df['target'] = label_encoder.fit_transform(train_df[target])
249
-
250
- # Save label encoder for later use
251
- if save_artifacts(label_encoder, "artifacts", "encoder.pkl"):
252
- st.sidebar.success("✅ Data processed successfully")
253
- else:
254
- st.sidebar.warning("Please select different columns for text and target")
255
-
256
- except Exception as e:
257
- st.error(f"❌ Error processing data: {str(e)}")
258
- train_df = None
259
- info = None
260
 
261
- # Data Analysis Section
262
- if section == "📊 Data Analysis":
263
- st.header("📊 Data Analysis & Insights")
264
-
265
- if train_data is not None and train_df is not None and info is not None:
266
- try:
267
- # Create tabs for better organization
268
- tab1, tab2, tab3 = st.tabs(["📈 Basic Stats", "📝 Text Analysis", "📊 Visualizations"])
269
-
270
- with tab1:
271
- col1, col2, col3 = st.columns(3)
272
-
273
- with col1:
274
- st.metric("📊 Data Shape", f"{info.shape()[0]} x {info.shape()[1]}")
275
-
276
- with col2:
277
- imbalance_info = info.class_imbalanced()
278
- st.metric("⚖️ Class Balance", "Balanced" if not imbalance_info else "Imbalanced")
279
-
280
- with col3:
281
- missing_info = info.missing_values()
282
- total_missing = sum(missing_info.values()) if isinstance(missing_info, dict) else 0
283
- st.metric("❌ Missing Values", str(total_missing))
284
-
285
- st.subheader("📋 Processed Data Preview")
286
- st.dataframe(train_df[['clean_text', 'text_length', 'target']].head(), use_container_width=True)
287
-
288
- with tab2:
289
- st.subheader("📏 Text Length Analysis")
290
- text_analysis = info.analysis_text_length('text_length')
291
-
292
- # Display stats in a nice format
293
- stats_col1, stats_col2 = st.columns(2)
294
- with stats_col1:
295
- st.json(text_analysis)
296
-
297
- with stats_col2:
298
- correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
299
- st.metric("🔗 Text Length-Target Correlation", f"{correlation:.4f}")
300
 
301
- with tab3:
302
- st.subheader("📊 Data Visualizations")
303
- vis = Visualizations(train_df, text_data, target)
304
-
305
- col1, col2 = st.columns(2)
306
- with col1:
307
- st.write("**Class Distribution**")
308
- vis.class_distribution()
309
-
310
- with col2:
311
- st.write("**Text Length Distribution**")
312
- vis.text_length_distribution()
313
 
314
- except Exception as e:
315
- st.error(f"❌ Error in data analysis: {str(e)}")
316
- else:
317
- st.info("👆 Please upload training data in the sidebar to get insights")
 
318
 
319
- # Train Model Section
320
- elif section == "🚀 Train Model":
321
- st.header("🚀 Train Classification Model")
322
-
323
- if train_data is not None and train_df is not None:
324
- try:
325
- # Create two columns for model selection
326
- col1, col2 = st.columns(2)
327
 
328
- with col1:
329
- st.subheader("🤖 Choose Model")
330
- model = st.radio("Select Algorithm:", [
331
- "Logistic Regression", "Decision Tree",
332
- "Random Forest", "Linear SVC", "SVC",
333
- "Multinomial Naive Bayes", "Gaussian Naive Bayes"
334
- ])
335
-
336
- with col2:
337
- st.subheader("🔤 Choose Vectorizer")
338
- vectorizer_choice = st.radio("Select Vectorizer:", ["Tfidf Vectorizer", "Count Vectorizer"])
339
 
340
- # Initialize vectorizer
341
- if vectorizer_choice == "Tfidf Vectorizer":
342
- vectorizer = TfidfVectorizer(max_features=10000)
343
- st.session_state.vectorizer_type = "tfidf"
344
- else:
345
- vectorizer = CountVectorizer(max_features=10000)
346
- st.session_state.vectorizer_type = "count"
347
 
348
- st.subheader("📋 Training Data Preview")
349
- st.dataframe(train_df[['clean_text', 'target']].head(3), use_container_width=True)
350
-
351
- # Vectorize text data
352
- with st.spinner("Preparing data..."):
353
- X = vectorizer.fit_transform(train_df['clean_text'])
354
- y = train_df['target']
355
-
356
- # Split data
357
- X_train, X_test, y_train, y_test = process.split_data(X, y)
358
- st.success(f"✅ Data prepared - Train: {X_train.shape}, Test: {X_test.shape}")
 
 
 
 
 
 
 
359
 
360
- # Save vectorizer for later use
361
- vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
362
- save_artifacts(vectorizer, "artifacts", vectorizer_filename)
 
 
 
 
 
 
 
 
 
 
 
 
363
 
364
- if st.button("🚀 Start Training", type="primary", use_container_width=True):
365
- progress_bar = st.progress(0)
366
- status_text = st.empty()
367
-
368
- with st.spinner(f"Training {model} model..."):
369
- status_text.text("Initializing model...")
370
- progress_bar.progress(20)
371
-
372
- models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
373
-
374
- status_text.text("Training in progress...")
375
- progress_bar.progress(50)
376
-
377
- # Train selected model
378
- if model == "Logistic Regression":
379
- models.LogisticRegression()
380
- elif model == "Decision Tree":
381
- models.DecisionTree()
382
- elif model == "Linear SVC":
383
- models.LinearSVC()
384
- elif model == "SVC":
385
- models.SVC()
386
- elif model == "Multinomial Naive Bayes":
387
- models.MultinomialNB()
388
- elif model == "Random Forest":
389
- models.RandomForestClassifier()
390
- elif model == "Gaussian Naive Bayes":
391
- models.GaussianNB()
392
-
393
- progress_bar.progress(100)
394
- status_text.text("Training completed!")
395
-
396
- st.success("🎉 Model training completed successfully!")
397
- st.balloons()
398
- st.info("💡 You can now use the 'Predictions' section to classify new text.")
399
 
400
- except Exception as e:
401
- st.error(f"Error in model training: {str(e)}")
402
- st.exception(e)
403
- else:
404
- st.info("👆 Please upload training data in the sidebar to train a model")
405
 
406
- # Predictions Section
407
- elif section == "🔮 Predictions":
408
- st.header("🔮 Text Classification Predictions")
 
 
 
 
 
409
 
410
- # Check if models exist
411
- if os.path.exists("models") and os.listdir("models"):
412
- tab1, tab2 = st.tabs(["🎯 Single Prediction", "📊 Batch Predictions"])
413
-
414
- with tab1:
415
- st.subheader("🎯 Classify Single Text")
416
-
417
- # Text input for prediction
418
- text_input = st.text_area("Enter the text to classify:", height=100, placeholder="Type or paste your text here...")
419
-
420
- # Model selection
421
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
422
-
423
- if available_models:
424
- selected_model = st.selectbox("🤖 Choose the trained model:", available_models)
425
-
426
- # Prediction button
427
- if st.button("🔮 Predict", key="single_predict", type="primary"):
428
- if text_input.strip():
429
- with st.spinner("Making prediction..."):
430
- predicted_label, prediction_proba = predict_text(
431
- selected_model,
432
- text_input,
433
- st.session_state.get('vectorizer_type', 'tfidf')
434
- )
435
-
436
- if predicted_label is not None:
437
- st.success("🎉 Prediction completed!")
438
-
439
- # Display results
440
- st.markdown("### 📋 Prediction Results")
441
-
442
- # Create result container
443
- result_container = st.container()
444
- with result_container:
445
- st.markdown(f"**📝 Input Text:** {text_input}")
446
- st.markdown(f"**🏷️ Predicted Class:** `{predicted_label}`")
447
-
448
- # Display probabilities if available
449
- if prediction_proba is not None:
450
- st.markdown("**📊 Class Probabilities:**")
451
-
452
- # Load encoder to get class names
453
- encoder = load_artifacts("artifacts", "encoder.pkl")
454
- if encoder is not None:
455
- classes = encoder.classes_
456
- prob_df = pd.DataFrame({
457
- 'Class': classes,
458
- 'Probability': prediction_proba
459
- }).sort_values('Probability', ascending=False)
460
-
461
- st.bar_chart(prob_df.set_index('Class'))
462
- st.dataframe(prob_df, use_container_width=True)
463
- else:
464
- st.warning("⚠️ Please enter some text to classify")
465
- else:
466
- st.warning("⚠️ No trained models found. Please train a model first.")
467
 
468
- with tab2:
469
- st.subheader("📊 Batch Classification")
470
-
471
- uploaded_file = st.file_uploader(
472
- "Upload a CSV file with text to classify",
473
- type=['csv'],
474
- help="Upload a CSV file containing text data for batch classification"
475
- )
476
-
477
- if uploaded_file is not None:
478
- try:
479
- batch_df = safe_file_upload(uploaded_file)
480
- if batch_df is not None:
481
- st.write("📋 Uploaded data preview:")
482
- st.dataframe(batch_df.head(), use_container_width=True)
483
-
484
- # Select text column
485
- text_column = st.selectbox("📝 Select the text column:", batch_df.columns.tolist())
486
 
487
- available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
488
- batch_model = st.selectbox("🤖 Choose model for batch prediction:", available_models, key="batch_model")
 
 
489
 
490
- if st.button("🚀 Run Batch Predictions", key="batch_predict", type="primary"):
491
- progress_bar = st.progress(0)
492
- status_text = st.empty()
493
 
494
- with st.spinner("Processing batch predictions..."):
495
- predictions = []
496
- total_texts = len(batch_df)
497
-
498
- for i, text in enumerate(batch_df[text_column]):
499
- status_text.text(f"Processing {i+1}/{total_texts} texts...")
500
- progress_bar.progress((i+1)/total_texts)
501
-
502
- pred, _ = predict_text(
503
- batch_model,
504
- str(text),
505
- st.session_state.get('vectorizer_type', 'tfidf')
506
- )
507
- predictions.append(pred if pred is not None else "Error")
508
-
509
- batch_df['Predicted_Class'] = predictions
510
-
511
- st.success("🎉 Batch predictions completed!")
512
- st.write("📊 Results:")
513
- st.dataframe(batch_df[[text_column, 'Predicted_Class']], use_container_width=True)
514
 
515
- # Download results
516
- csv = batch_df.to_csv(index=False)
517
- st.download_button(
518
- label="📥 Download predictions as CSV",
519
- data=csv,
520
- file_name="batch_predictions.csv",
521
- mime="text/csv",
522
- type="primary"
523
- )
524
- except Exception as e:
525
- st.error(f"❌ Error in batch prediction: {str(e)}")
526
  else:
527
- st.info("⚠️ No trained models found. Please go to 'Train Model' section to train a model first.")
528
-
529
- # Main execution
530
- def main():
531
- # Page config
532
- st.set_page_config(
533
- page_title="Text Classification App",
534
- page_icon="🤖",
535
- layout="wide",
536
- initial_sidebar_state="expanded"
537
- )
538
 
539
- # Custom CSS for better styling
540
- st.markdown("""
541
- <style>
542
- .main {
543
- padding-top: 1rem;
544
- }
545
- .stAlert {
546
- margin-top: 1rem;
547
- }
548
- .metric-container {
549
- background-color: #f0f2f6;
550
- padding: 1rem;
551
- border-radius: 0.5rem;
552
- margin: 0.5rem 0;
553
- }
554
- </style>
555
- """, unsafe_allow_html=True)
556
 
557
- # Check authentication
558
- if check_password():
559
- main_app()
560
-
561
- if __name__ == "__main__":
562
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  from NoCodeTextClassifier.models import Models
9
  import os
10
  import pickle
 
 
11
  from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Utility functions
14
  def save_artifacts(obj, folder_name, file_name):
15
  """Save artifacts like encoders and vectorizers"""
16
+ os.makedirs(folder_name, exist_ok=True)
17
+ with open(os.path.join(folder_name, file_name), 'wb') as f:
18
+ pickle.dump(obj, f)
 
 
 
 
 
19
 
20
  def load_artifacts(folder_name, file_name):
21
  """Load saved artifacts"""
 
23
  with open(os.path.join(folder_name, file_name), 'rb') as f:
24
  return pickle.load(f)
25
  except FileNotFoundError:
26
+ st.error(f"File {file_name} not found in {folder_name} folder")
 
 
 
27
  return None
28
 
29
  def load_model(model_name):
 
34
  except FileNotFoundError:
35
  st.error(f"Model {model_name} not found. Please train a model first.")
36
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  def predict_text(model_name, text, vectorizer_type="tfidf"):
39
  """Make prediction on new text"""
 
81
  st.error(f"Error during prediction: {str(e)}")
82
  return None, None
83
 
84
+ # Streamlit App
85
+ st.title('No Code Text Classification App')
86
+ st.write('Understand the behavior of your text data and train a model to classify the text data')
 
 
 
 
 
 
 
 
 
 
87
 
88
+ # Sidebar
89
+ section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
90
 
91
+ # Upload Data
92
+ st.sidebar.subheader("Upload Your Dataset")
93
+ train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
94
+ test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
95
+
96
+ # Global variables to store data and settings
97
+ if 'vectorizer_type' not in st.session_state:
98
+ st.session_state.vectorizer_type = "tfidf"
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
+ if train_data is not None:
101
+ try:
102
+ train_df = pd.read_csv(train_data, encoding='latin1')
103
+
104
+ if test_data is not None:
105
+ test_df = pd.read_csv(test_data, encoding='latin1')
106
+ else:
107
+ test_df = None
108
+
109
+ st.write("Training Data Preview:")
110
+ st.write(train_df.head(3))
111
+
112
+ columns = train_df.columns.tolist()
113
+ text_data = st.sidebar.selectbox("Choose the text column:", columns)
114
+ target = st.sidebar.selectbox("Choose the target column:", columns)
115
 
116
+ # Process data
117
+ info = Informations(train_df, text_data, target)
118
+ train_df['clean_text'] = info.clean_text()
119
+ train_df['text_length'] = info.text_length()
120
+
121
+ # Handle label encoding manually if the class doesn't store encoder
122
+ from sklearn.preprocessing import LabelEncoder
123
+ label_encoder = LabelEncoder()
124
+ train_df['target'] = label_encoder.fit_transform(train_df[target])
125
+
126
+ # Save label encoder for later use
127
+ os.makedirs("artifacts", exist_ok=True)
128
+ save_artifacts(label_encoder, "artifacts", "encoder.pkl")
129
+
130
+ except Exception as e:
131
+ st.error(f"Error loading data: {str(e)}")
132
+ train_df = None
133
+ info = None
134
 
135
+ # Data Analysis Section
136
+ if section == "Data Analysis":
137
+ if train_data is not None and train_df is not None:
138
+ try:
139
+ st.subheader("Get Insights from the Data")
140
 
141
+ st.write("Data Shape:", info.shape())
142
+ st.write("Class Imbalance:", info.class_imbalanced())
143
+ st.write("Missing Values:", info.missing_values())
 
 
 
 
 
 
 
 
 
144
 
145
+ st.write("Processed Data Preview:")
146
+ st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
147
+
148
+ st.markdown("**Text Length Analysis**")
149
+ st.write(info.analysis_text_length('text_length'))
150
+
151
+ # Calculate correlation manually since we handled encoding separately
152
+ correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
153
+ st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
 
 
 
 
 
 
 
 
 
 
 
 
 
154
 
155
+ st.subheader("Visualizations")
156
+ vis = Visualizations(train_df, text_data, target)
157
+ vis.class_distribution()
158
+ vis.text_length_distribution()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
+ except Exception as e:
161
+ st.error(f"Error in data analysis: {str(e)}")
162
+ else:
163
+ st.warning("Please upload training data to get insights")
 
 
 
 
 
 
 
 
164
 
165
+ # Train Model Section
166
+ elif section == "Train Model":
167
+ if train_data is not None and train_df is not None:
168
+ try:
169
+ st.subheader("Train a Model")
170
 
171
+ # Create two columns for model selection
172
+ col1, col2 = st.columns(2)
 
 
 
 
 
 
173
 
174
+ with col1:
175
+ model = st.radio("Choose the Model", [
176
+ "Logistic Regression", "Decision Tree",
177
+ "Random Forest", "Linear SVC", "SVC",
178
+ "Multinomial Naive Bayes", "Gaussian Naive Bayes"
179
+ ])
180
+
181
+ with col2:
182
+ vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
 
 
183
 
184
+ # Initialize vectorizer
185
+ if vectorizer_choice == "Tfidf Vectorizer":
186
+ vectorizer = TfidfVectorizer(max_features=10000)
187
+ st.session_state.vectorizer_type = "tfidf"
188
+ else:
189
+ vectorizer = CountVectorizer(max_features=10000)
190
+ st.session_state.vectorizer_type = "count"
191
 
192
+ st.write("Training Data Preview:")
193
+ st.write(train_df[['clean_text', 'target']].head(3))
194
+
195
+ # Vectorize text data
196
+ X = vectorizer.fit_transform(train_df['clean_text'])
197
+ y = train_df['target']
198
+
199
+ # Split data
200
+ X_train, X_test, y_train, y_test = process.split_data(X, y)
201
+ st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
202
+
203
+ # Save vectorizer for later use
204
+ vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
205
+ save_artifacts(vectorizer, "artifacts", vectorizer_filename)
206
+
207
+ if st.button("Start Training"):
208
+ with st.spinner("Training model..."):
209
+ models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
210
 
211
+ # Train selected model
212
+ if model == "Logistic Regression":
213
+ models.LogisticRegression()
214
+ elif model == "Decision Tree":
215
+ models.DecisionTree()
216
+ elif model == "Linear SVC":
217
+ models.LinearSVC()
218
+ elif model == "SVC":
219
+ models.SVC()
220
+ elif model == "Multinomial Naive Bayes":
221
+ models.MultinomialNB()
222
+ elif model == "Random Forest":
223
+ models.RandomForestClassifier()
224
+ elif model == "Gaussian Naive Bayes":
225
+ models.GaussianNB()
226
 
227
+ st.success("Model training completed!")
228
+ st.info("You can now use the 'Predictions' section to classify new text.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
+ except Exception as e:
231
+ st.error(f"Error in model training: {str(e)}")
232
+ else:
233
+ st.warning("Please upload training data to train a model")
 
234
 
235
+ # Predictions Section
236
+ elif section == "Predictions":
237
+ st.subheader("Perform Predictions on New Text")
238
+
239
+ # Check if models exist
240
+ if os.path.exists("models") and os.listdir("models"):
241
+ # Text input for prediction
242
+ text_input = st.text_area("Enter the text to classify:", height=100)
243
 
244
+ # Model selection
245
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
246
+
247
+ if available_models:
248
+ selected_model = st.selectbox("Choose the trained model:", available_models)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
+ # Prediction button
251
+ if st.button("Predict", key="single_predict"):
252
+ if text_input.strip():
253
+ with st.spinner("Making prediction..."):
254
+ predicted_label, prediction_proba = predict_text(
255
+ selected_model,
256
+ text_input,
257
+ st.session_state.get('vectorizer_type', 'tfidf')
258
+ )
259
+
260
+ if predicted_label is not None:
261
+ st.success("Prediction completed!")
 
 
 
 
 
 
262
 
263
+ # Display results
264
+ st.markdown("### Prediction Results")
265
+ st.markdown(f"**Input Text:** {text_input}")
266
+ st.markdown(f"**Predicted Class:** {predicted_label}")
267
 
268
+ # Display probabilities if available
269
+ if prediction_proba is not None:
270
+ st.markdown("**Class Probabilities:**")
271
 
272
+ # Load encoder to get class names
273
+ encoder = load_artifacts("artifacts", "encoder.pkl")
274
+ if encoder is not None:
275
+ classes = encoder.classes_
276
+ prob_df = pd.DataFrame({
277
+ 'Class': classes,
278
+ 'Probability': prediction_proba
279
+ }).sort_values('Probability', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
+ st.bar_chart(prob_df.set_index('Class'))
282
+ st.dataframe(prob_df)
283
+ else:
284
+ st.warning("Please enter some text to classify")
 
 
 
 
 
 
 
285
  else:
286
+ st.warning("No trained models found. Please train a model first.")
287
+ else:
288
+ st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
289
+
290
+ # Option to classify multiple texts
291
+ st.markdown("---")
292
+ st.subheader("Batch Predictions")
 
 
 
 
293
 
294
+ uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
+ if uploaded_file is not None:
297
+ try:
298
+ batch_df = pd.read_csv(uploaded_file, encoding='latin1')
299
+ st.write("Uploaded data preview:")
300
+ st.write(batch_df.head())
301
+
302
+ # Select text column
303
+ text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
304
+
305
+ if os.path.exists("models") and os.listdir("models"):
306
+ available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
307
+ batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
308
+
309
+ if st.button("Run Batch Predictions", key="batch_predict"):
310
+ with st.spinner("Processing batch predictions..."):
311
+ predictions = []
312
+
313
+ for text in batch_df[text_column]:
314
+ pred, _ = predict_text(
315
+ batch_model,
316
+ str(text),
317
+ st.session_state.get('vectorizer_type', 'tfidf')
318
+ )
319
+ predictions.append(pred if pred is not None else "Error")
320
+
321
+ batch_df['Predicted_Class'] = predictions
322
+
323
+ st.success("Batch predictions completed!")
324
+ st.write("Results:")
325
+ st.write(batch_df[[text_column, 'Predicted_Class']])
326
+
327
+ # Download results
328
+ csv = batch_df.to_csv(index=False)
329
+ st.download_button(
330
+ label="Download predictions as CSV",
331
+ data=csv,
332
+ file_name="batch_predictions.csv",
333
+ mime="text/csv"
334
+ )
335
+ except Exception as e:
336
+ st.error(f"Error in batch prediction: {str(e)}")