Alamgirapi's picture
Update app.py
1ea5800 verified
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from NoCodeTextClassifier.EDA import Informations, Visualizations
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from NoCodeTextClassifier.preprocessing import process, TextCleaner, Vectorization
from NoCodeTextClassifier.models import Models
import os
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Utility functions
def save_artifacts(obj, folder_name, file_name):
"""Save artifacts like encoders and vectorizers"""
os.makedirs(folder_name, exist_ok=True)
with open(os.path.join(folder_name, file_name), 'wb') as f:
pickle.dump(obj, f)
def load_artifacts(folder_name, file_name):
"""Load saved artifacts"""
try:
with open(os.path.join(folder_name, file_name), 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
st.error(f"File {file_name} not found in {folder_name} folder")
return None
def load_model(model_name):
"""Load trained model"""
try:
with open(os.path.join('models', model_name), 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
st.error(f"Model {model_name} not found. Please train a model first.")
return None
def predict_text(model_name, text, vectorizer_type="tfidf"):
"""Make prediction on new text"""
try:
# Load model
model = load_model(model_name)
if model is None:
return None, None
# Load vectorizer
vectorizer_file = f"{vectorizer_type}_vectorizer.pkl"
vectorizer = load_artifacts("artifacts", vectorizer_file)
if vectorizer is None:
return None, None
# Load label encoder
encoder = load_artifacts("artifacts", "encoder.pkl")
if encoder is None:
return None, None
# Clean and vectorize text
text_cleaner = TextCleaner()
clean_text = text_cleaner.clean_text(text)
# Transform text using the same vectorizer used during training
text_vector = vectorizer.transform([clean_text])
# Make prediction
prediction = model.predict(text_vector)
prediction_proba = None
# Get prediction probabilities if available
if hasattr(model, 'predict_proba'):
try:
prediction_proba = model.predict_proba(text_vector)[0]
except:
pass
# Decode prediction
predicted_label = encoder.inverse_transform(prediction)[0]
return predicted_label, prediction_proba
except Exception as e:
st.error(f"Error during prediction: {str(e)}")
return None, None
# Streamlit App
st.title('No Code Text Classification App')
st.write('Understand the behavior of your text data and train a model to classify the text data')
# Sidebar
section = st.sidebar.radio("Choose Section", ["Data Analysis", "Train Model", "Predictions"])
# Upload Data
st.sidebar.subheader("Upload Your Dataset")
train_data = st.sidebar.file_uploader("Upload training data", type=["csv"])
test_data = st.sidebar.file_uploader("Upload test data (optional)", type=["csv"])
# Global variables to store data and settings
if 'vectorizer_type' not in st.session_state:
st.session_state.vectorizer_type = "tfidf"
if train_data is not None:
try:
train_df = pd.read_csv(train_data, encoding='latin1')
if test_data is not None:
test_df = pd.read_csv(test_data, encoding='latin1')
else:
test_df = None
st.write("Training Data Preview:")
st.write(train_df.head(3))
columns = train_df.columns.tolist()
text_data = st.sidebar.selectbox("Choose the text column:", columns)
target = st.sidebar.selectbox("Choose the target column:", columns)
# Process data
info = Informations(train_df, text_data, target)
train_df['clean_text'] = info.clean_text()
train_df['text_length'] = info.text_length()
# Handle label encoding manually if the class doesn't store encoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df[target])
# Save label encoder for later use
os.makedirs("artifacts", exist_ok=True)
save_artifacts(label_encoder, "artifacts", "encoder.pkl")
except Exception as e:
st.error(f"Error loading data: {str(e)}")
train_df = None
info = None
# Data Analysis Section
if section == "Data Analysis":
if train_data is not None and train_df is not None:
try:
st.subheader("Get Insights from the Data")
st.write("Data Shape:", info.shape())
st.write("Class Imbalance:", info.class_imbalanced())
st.write("Missing Values:", info.missing_values())
st.write("Processed Data Preview:")
st.write(train_df[['clean_text', 'text_length', 'target']].head(3))
st.markdown("**Text Length Analysis**")
st.write(info.analysis_text_length('text_length'))
# Calculate correlation manually since we handled encoding separately
correlation = train_df[['text_length', 'target']].corr().iloc[0, 1]
st.write(f"Correlation between Text Length and Target: {correlation:.4f}")
st.subheader("Visualizations")
vis = Visualizations(train_df, text_data, target)
vis.class_distribution()
vis.text_length_distribution()
except Exception as e:
st.error(f"Error in data analysis: {str(e)}")
else:
st.warning("Please upload training data to get insights")
# Train Model Section
elif section == "Train Model":
if train_data is not None and train_df is not None:
try:
st.subheader("Train a Model")
# Create two columns for model selection
col1, col2 = st.columns(2)
with col1:
model = st.radio("Choose the Model", [
"Logistic Regression", "Decision Tree",
"Random Forest", "Linear SVC", "SVC",
"Multinomial Naive Bayes", "Gaussian Naive Bayes"
])
with col2:
vectorizer_choice = st.radio("Choose Vectorizer", ["Tfidf Vectorizer", "Count Vectorizer"])
# Initialize vectorizer
if vectorizer_choice == "Tfidf Vectorizer":
vectorizer = TfidfVectorizer(max_features=10000)
st.session_state.vectorizer_type = "tfidf"
else:
vectorizer = CountVectorizer(max_features=10000)
st.session_state.vectorizer_type = "count"
st.write("Training Data Preview:")
st.write(train_df[['clean_text', 'target']].head(3))
# Vectorize text data
X = vectorizer.fit_transform(train_df['clean_text'])
y = train_df['target']
# Split data
X_train, X_test, y_train, y_test = process.split_data(X, y)
st.write(f"Data split - Train: {X_train.shape}, Test: {X_test.shape}")
# Save vectorizer for later use
vectorizer_filename = f"{st.session_state.vectorizer_type}_vectorizer.pkl"
save_artifacts(vectorizer, "artifacts", vectorizer_filename)
if st.button("Start Training"):
with st.spinner("Training model..."):
models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
# Train selected model
if model == "Logistic Regression":
models.LogisticRegression()
elif model == "Decision Tree":
models.DecisionTree()
elif model == "Linear SVC":
models.LinearSVC()
elif model == "SVC":
models.SVC()
elif model == "Multinomial Naive Bayes":
models.MultinomialNB()
elif model == "Random Forest":
models.RandomForestClassifier()
elif model == "Gaussian Naive Bayes":
models.GaussianNB()
st.success("Model training completed!")
st.info("You can now use the 'Predictions' section to classify new text.")
except Exception as e:
st.error(f"Error in model training: {str(e)}")
else:
st.warning("Please upload training data to train a model")
# Predictions Section
elif section == "Predictions":
st.subheader("Perform Predictions on New Text")
# Check if models exist
if os.path.exists("models") and os.listdir("models"):
# Text input for prediction
text_input = st.text_area("Enter the text to classify:", height=100)
# Model selection
available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
if available_models:
selected_model = st.selectbox("Choose the trained model:", available_models)
# Prediction button
if st.button("Predict", key="single_predict"):
if text_input.strip():
with st.spinner("Making prediction..."):
predicted_label, prediction_proba = predict_text(
selected_model,
text_input,
st.session_state.get('vectorizer_type', 'tfidf')
)
if predicted_label is not None:
st.success("Prediction completed!")
# Display results
st.markdown("### Prediction Results")
st.markdown(f"**Input Text:** {text_input}")
st.markdown(f"**Predicted Class:** {predicted_label}")
# Display probabilities if available
if prediction_proba is not None:
st.markdown("**Class Probabilities:**")
# Load encoder to get class names
encoder = load_artifacts("artifacts", "encoder.pkl")
if encoder is not None:
classes = encoder.classes_
prob_df = pd.DataFrame({
'Class': classes,
'Probability': prediction_proba
}).sort_values('Probability', ascending=False)
st.bar_chart(prob_df.set_index('Class'))
st.dataframe(prob_df)
else:
st.warning("Please enter some text to classify")
else:
st.warning("No trained models found. Please train a model first.")
else:
st.warning("No trained models found. Please go to 'Train Model' section to train a model first.")
# Option to classify multiple texts
st.markdown("---")
st.subheader("Batch Predictions")
uploaded_file = st.file_uploader("Upload a CSV file with text to classify", type=['csv'])
if uploaded_file is not None:
try:
batch_df = pd.read_csv(uploaded_file, encoding='latin1')
st.write("Uploaded data preview:")
st.write(batch_df.head())
# Select text column
text_column = st.selectbox("Select the text column:", batch_df.columns.tolist())
if os.path.exists("models") and os.listdir("models"):
available_models = [f for f in os.listdir("models") if f.endswith('.pkl')]
batch_model = st.selectbox("Choose model for batch prediction:", available_models, key="batch_model")
if st.button("Run Batch Predictions", key="batch_predict"):
with st.spinner("Processing batch predictions..."):
predictions = []
for text in batch_df[text_column]:
pred, _ = predict_text(
batch_model,
str(text),
st.session_state.get('vectorizer_type', 'tfidf')
)
predictions.append(pred if pred is not None else "Error")
batch_df['Predicted_Class'] = predictions
st.success("Batch predictions completed!")
st.write("Results:")
st.write(batch_df[[text_column, 'Predicted_Class']])
# Download results
csv = batch_df.to_csv(index=False)
st.download_button(
label="Download predictions as CSV",
data=csv,
file_name="batch_predictions.csv",
mime="text/csv"
)
except Exception as e:
st.error(f"Error in batch prediction: {str(e)}")