Spaces:

Alamgirapi
/

NoCodeTextClassifier

Sleeping

App Files Files Community

Alamgirapi committed on Aug 6, 2025

Commit

fe77a26

verified ·

1 Parent(s): 7c15afe

Upload folder NoCodeTextClassifier

Browse files

Files changed (14) hide show

NoCodeTextClassifier/EDA.py +87 -0
NoCodeTextClassifier/Inference.py +22 -0
NoCodeTextClassifier/__init__.py +0 -0
NoCodeTextClassifier/__pycache__/EDA.cpython-311.pyc +0 -0
NoCodeTextClassifier/__pycache__/__init__.cpython-311.pyc +0 -0
NoCodeTextClassifier/__pycache__/models.cpython-311.pyc +0 -0
NoCodeTextClassifier/__pycache__/preprocessing.cpython-311.pyc +0 -0
NoCodeTextClassifier/__pycache__/utils.cpython-311.pyc +0 -0
NoCodeTextClassifier/exception/__init__.py +0 -0
NoCodeTextClassifier/logger/__init__.py +0 -0
NoCodeTextClassifier/main.py +24 -0
NoCodeTextClassifier/models.py +107 -0
NoCodeTextClassifier/preprocessing.py +196 -0
NoCodeTextClassifier/utils.py +39 -0

NoCodeTextClassifier/EDA.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import seaborn as sns
+import matplotlib.pyplot as plt
+import streamlit as st
+import numpy as np
+from NoCodeTextClassifier.preprocessing import TextCleaner
+from sklearn.preprocessing import LabelEncoder
class Informations:
    """Tabular summaries of a text-classification dataset.

    Args:
        data: pandas DataFrame holding the dataset.
        text_data: Name of the column that contains the raw text.
        target: Name of the column that contains the class labels.
    """

    def __init__(self, data, text_data, target):
        self.data = data
        self.text_data = text_data
        self.target = target

    def shape(self):
        """Return the ``(rows, columns)`` shape of the dataset."""
        return self.data.shape

    def class_imbalanced(self):
        """Return the number of rows for each value of the target column."""
        return self.data[self.target].value_counts()

    def missing_values(self):
        """Return the number of null entries in every column."""
        return self.data.isnull().sum()

    def label_encoder(self):
        """Return the target column encoded with a freshly fitted LabelEncoder."""
        encoder = LabelEncoder()
        return encoder.fit_transform(self.data[self.target])

    def clean_text(self):
        """Return the text column with ``TextCleaner.clean_text`` applied to each row."""
        text_cleaner = TextCleaner()
        return self.data[self.text_data].apply(text_cleaner.clean_text)

    def text_length(self):
        """Return the character length of every entry in the text column.

        Missing entries count as length 0 and non-string entries are measured
        on their ``str()`` form, instead of failing on ``len()``.
        """
        texts = self.data[self.text_data].fillna("")
        return texts.apply(
            lambda value: len(value) if isinstance(value, str) else len(str(value))
        )

    def analysis_text_length(self, text_length):
        """Return ``describe()`` statistics for the column named ``text_length``."""
        return self.data[text_length].describe()

    def correlation(self, other_feature):
        """Return the correlation between ``other_feature`` and the target column."""
        # Use the configured target column rather than a literal "target",
        # which only worked for datasets that happened to use that name.
        return self.data[other_feature].corr(self.data[self.target])
class Visualizations:
    """Streamlit-rendered plots for a text-classification dataset.

    Args:
        data: pandas DataFrame holding the dataset.
        text_data: Name of the column that contains the raw text.
        target: Name of the column that contains the class labels.
    """

    def __init__(self, data, text_data, target):
        self.data = data
        self.text_data = text_data
        self.target = target

    def _render(self, fig):
        """Display ``fig`` in Streamlit and then release it."""
        st.pyplot(fig)
        # Figures created through pyplot stay registered until closed; close
        # each one after rendering so they do not pile up.
        plt.close(fig)

    def simple_plot(self):
        """Render a demo sine-wave plot."""
        # Generate sample data
        x = np.linspace(0, 10, 100)
        y = np.sin(x)
        fig, ax = plt.subplots()
        ax.plot(x, y, label="Sine Wave")
        ax.set_title("Matplotlib Plot in Streamlit")
        ax.set_xlabel("X-axis")
        ax.set_ylabel("Y-axis")
        ax.legend()
        # Display the plot in Streamlit
        self._render(fig)

    def class_distribution(self):
        """Render a count plot of the target column."""
        fig, ax = plt.subplots()
        # NOTE(review): recent seaborn releases may warn when `palette` is
        # passed without `hue` - confirm against the pinned seaborn version.
        sns.countplot(x=self.data[self.target], ax=ax, palette="pastel")
        ax.set_title("Class Distribution")
        ax.set_xlabel("Class")
        ax.set_ylabel("Count")
        self._render(fig)

    def text_length_distribution(self):
        """Render a histogram (with KDE) of text lengths.

        Uses the ``text_length`` column when the DataFrame has one; otherwise
        the lengths are computed from the text column so the plot does not
        fail with a ``KeyError``.
        """
        if 'text_length' in self.data:
            lengths = self.data['text_length']
        else:
            lengths = self.data[self.text_data].fillna("").astype(str).str.len()
        fig, ax = plt.subplots()
        sns.histplot(lengths, ax=ax, kde=True)
        ax.set_title("Text Length Distribution")
        ax.set_xlabel("Text Length")
        ax.set_ylabel("Count")
        self._render(fig)

NoCodeTextClassifier/Inference.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from NoCodeTextClassifier import preprocessing
+from NoCodeTextClassifier import utils
def prediction(text, model_name="DecisionTreeClassifier.pkl"):
    """Classify one raw text with a previously trained model.

    The text is cleaned, vectorized with the saved TF-IDF vectorizer, passed
    to the pickled model and decoded with the saved label encoder.

    Args:
        text: Raw input text.
        model_name: File name of the pickled model handed to
            ``utils.prediction``. Defaults to the decision tree that was
            previously hard-coded.

    Returns:
        The decoded label for ``text`` (it is also printed).
    """
    # Local names deliberately differ from the TextCleaner class and from
    # this function, which the original rebinding shadowed.
    cleaner = preprocessing.TextCleaner()
    cleaned = cleaner.clean_text(text)
    vectorizer = preprocessing.Vectorization()
    features = vectorizer.TfidfVectorizer(eval=True, string=cleaned)
    encoded = utils.prediction(model_name, features)
    encoder = utils.load_artifacts("artifacts", "encoder.pkl")
    output = encoder.inverse_transform(encoded)[0]
    print(f"The prediction of given text : \t{output}")
    return output

NoCodeTextClassifier/__init__.py ADDED Viewed

File without changes

NoCodeTextClassifier/__pycache__/EDA.cpython-311.pyc ADDED Viewed

Binary file (6.13 kB). View file

NoCodeTextClassifier/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (160 Bytes). View file

NoCodeTextClassifier/__pycache__/models.cpython-311.pyc ADDED Viewed

Binary file (7.77 kB). View file

NoCodeTextClassifier/__pycache__/preprocessing.cpython-311.pyc ADDED Viewed

Binary file (12.6 kB). View file

NoCodeTextClassifier/__pycache__/utils.cpython-311.pyc ADDED Viewed

Binary file (2.73 kB). View file

NoCodeTextClassifier/exception/__init__.py ADDED Viewed

File without changes

NoCodeTextClassifier/logger/__init__.py ADDED Viewed

File without changes

NoCodeTextClassifier/main.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from NoCodeTextClassifier.preprocessing import *
+from NoCodeTextClassifier.models import *
if __name__ == "__main__":
    import sys

    # The CSV path may be given as the first command-line argument; without
    # one, fall back to the original developer path.
    default_path = r"C:\Users\abdullah\projects\NLP_project\NoCodeTextClassifier\ML Engineer\train.csv"
    data_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Instances get their own names so the imported classes (process,
    # Vectorization, TfidfVectorizer) are not overwritten.
    processor = process(data_path, 'email', 'class')
    df = processor.processing()
    print(df.head())

    vectorization = Vectorization(df, 'clean_text')
    tfidf_matrix = vectorization.TfidfVectorizer(max_features=10000)
    # Densify once and reuse the array for both printing and splitting.
    features = tfidf_matrix.toarray()
    print(features)

    X_train, X_test, y_train, y_test = processor.split_data(features, df['labeled_target'])
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    models = Models(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    models.DecisionTree()

NoCodeTextClassifier/models.py ADDED Viewed

	@@ -0,0 +1,107 @@

+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+import xgboost as xgb
+import streamlit as st
+# import lightgbm as lgb
+# from catboost import CatBoostClassifier
+from NoCodeTextClassifier.utils import *
class Models:
    """Train, persist and evaluate scikit-learn classifiers.

    Every public method builds one estimator from ``**kwargs``, fits it on the
    training split, pickles it into the ``models`` directory and then calls
    ``evaluation`` on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        # Imported locally so the class does not depend on `os` leaking in
        # through the star import of the utils module.
        import os

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        os.makedirs("models", exist_ok=True)

    def _train_and_evaluate(self, model, file_name):
        """Fit ``model``, pickle it as ``models/<file_name>`` and evaluate it.

        All model methods share this path so the saved artifact and the
        status messages are the same for every classifier.
        """
        import os
        import pickle

        model.fit(self.X_train, self.y_train)
        save_path = os.path.join("models", file_name)
        with open(save_path, 'wb') as f:
            pickle.dump(model, f)
        print("Training Completed")
        st.markdown("**Training Completed**")
        evaluation(file_name, self.X_test, self.y_test)
        print("Finished")

    def LogisticRegression(self, **kwargs):
        """Train and evaluate ``sklearn.linear_model.LogisticRegression``."""
        from sklearn.linear_model import LogisticRegression
        self._train_and_evaluate(LogisticRegression(**kwargs), 'LogisticRegression.pkl')

    def DecisionTree(self, **kwargs):
        """Train and evaluate ``sklearn.tree.DecisionTreeClassifier``."""
        from sklearn.tree import DecisionTreeClassifier
        self._train_and_evaluate(DecisionTreeClassifier(**kwargs), 'DecisionTreeClassifier.pkl')

    def LinearSVC(self, **kwargs):
        """Train and evaluate ``sklearn.svm.LinearSVC``."""
        from sklearn.svm import LinearSVC
        self._train_and_evaluate(LinearSVC(**kwargs), 'LinearSVC.pkl')

    def SVC(self, **kwargs):
        """Train and evaluate ``sklearn.svm.SVC``."""
        from sklearn.svm import SVC
        self._train_and_evaluate(SVC(**kwargs), 'SVC.pkl')

    def RandomForestClassifier(self, **kwargs):
        """Train and evaluate ``sklearn.ensemble.RandomForestClassifier``."""
        from sklearn.ensemble import RandomForestClassifier
        self._train_and_evaluate(RandomForestClassifier(**kwargs), 'RandomForestClassifier.pkl')

    def MultinomialNB(self, **kwargs):
        """Train and evaluate ``sklearn.naive_bayes.MultinomialNB``."""
        from sklearn.naive_bayes import MultinomialNB
        self._train_and_evaluate(MultinomialNB(**kwargs), 'MultinomialNB.pkl')

    def GaussianNB(self, **kwargs):
        """Train and evaluate ``sklearn.naive_bayes.GaussianNB``."""
        from sklearn.naive_bayes import GaussianNB
        self._train_and_evaluate(GaussianNB(**kwargs), 'GaussianNB.pkl')

NoCodeTextClassifier/preprocessing.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+from pathlib import Path
+import pickle
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+import string
+import os
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from NoCodeTextClassifier import utils
+import numpy as np
+import ssl
def _download_without_cert_check(package):
    """Retry one NLTK download with HTTPS certificate verification disabled.

    SECURITY: verification is switched off only for the duration of this
    single download and restored afterwards, instead of being disabled for
    the whole process at import time.

    Returns:
        The result of ``nltk.download``, or ``False`` when this Python build
        offers no unverified SSL context.
    """
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory is None:
        return False
    original_factory = ssl._create_default_https_context
    ssl._create_default_https_context = unverified_factory
    try:
        return nltk.download(package, quiet=True)
    finally:
        ssl._create_default_https_context = original_factory


# Download NLTK data with error handling
def download_nltk_data():
    """Make sure the NLTK corpora used for text cleaning are installed.

    Each resource is checked on its own, so a missing ``omw-1.4`` is fetched
    even when ``wordnet`` is already present. A normal, certificate-verified
    download is tried first; the unverified fallback is used only when that
    attempt reports failure.
    """
    required = (
        ('corpora/stopwords', 'stopwords'),
        ('corpora/wordnet', 'wordnet'),
        ('corpora/omw-1.4', 'omw-1.4'),  # Required for newer NLTK versions
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            if not nltk.download(package, quiet=True):
                _download_without_cert_check(package)


# Download required NLTK data
download_nltk_data()
class TextCleaner:
    """Normalize raw text for classification.

    Args:
        currency_symbols: Regex matching currency symbols; every match is
            replaced by the word ``currency``.
        stop_words: Collection of words to drop. Defaults to NLTK's English
            stop-word list (downloaded on demand).
        lemmatizer: Object exposing ``lemmatize(word)``. Defaults to NLTK's
            ``WordNetLemmatizer`` (corpora downloaded on demand).
    """

    # Compiled once for the class instead of on every clean_text() call.
    # NOTE: the last range (U+24C2-U+1F251) is very broad and also covers
    # non-emoji scripts such as CJK; kept unchanged for backward compatibility.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    # Comments and real tags only (a tag name must start with a letter), so
    # plain comparisons such as "5 < 10 and 7 > 3" are left alone.
    _HTML_TAG_PATTERN = re.compile(r'<!--.*?-->|</?[a-z][^<>]*>',
                                   flags=re.IGNORECASE | re.DOTALL)
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, currency_symbols = r'[\$\£\€\¥\₹\¢\₽\₩\₪]', stop_words=None, lemmatizer=None):
        self.currency_symbols = currency_symbols

        if stop_words is None:
            try:
                self.stop_words = set(stopwords.words('english'))
            except LookupError:
                nltk.download('stopwords', quiet=True)
                self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = stop_words

        if lemmatizer is None:
            try:
                self.lemmatizer = WordNetLemmatizer()
                # Smoke-test the lemmatizer so missing corpora surface here.
                self.lemmatizer.lemmatize('testing')
            except (AttributeError, LookupError) as e:
                print(f"WordNet lemmatizer initialization failed: {e}")
                nltk.download('wordnet', quiet=True)
                nltk.download('omw-1.4', quiet=True)
                self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = lemmatizer

    def remove_punctuation(self, text):
        """Return ``text`` with every ASCII punctuation character removed."""
        return text.translate(self._PUNCTUATION_TABLE)

    # Functions for cleaning text
    def clean_text(self, text):
        """Return a cleaned, lower-cased, lemmatized version of ``text``.

        Removes HTML tags, emojis, punctuation (including underscores),
        digits, extra whitespace and stop words; currency symbols become the
        word ``currency``. ``None``, NaN and blank input yield ``""``; other
        non-string input is converted with ``str()`` first.

        Cleaning is best-effort: if a step fails, the error is printed and the
        text as processed so far is returned.
        """
        # Handle non-string inputs; NaN (the only float unequal to itself)
        # is treated as missing rather than turned into the token "nan".
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)
        if not text.strip():
            return ""

        try:
            text = text.lower()
            # Tags must go BEFORE punctuation removal: once '<' and '>' are
            # stripped the tags can no longer be recognised and their names
            # would leak into the text. A space keeps neighbours separate.
            text = self._HTML_TAG_PATTERN.sub(' ', text)
            text = re.sub(self.currency_symbols, 'currency', text)
            text = self._EMOJI_PATTERN.sub('', text)
            text = self.remove_punctuation(text)
            # Drop remaining non-ASCII punctuation/symbols.
            text = re.sub(r'[^\w\s]', '', text)
            text = re.sub(r'\d', ' ', text)
            # split() also collapses the whitespace runs left by removals.
            words = [word for word in text.split() if word not in self.stop_words]
            text = ' '.join(words)

            # Lemmatization with error handling
            try:
                text = ' '.join(self.lemmatizer.lemmatize(word) for word in words)
            except (AttributeError, LookupError) as e:
                # Continue without lemmatization
                print(f"Lemmatization failed for text: {e}")
            return str(text)
        except Exception as e:
            print(f"Error cleaning text: {e}")
            return str(text)
class process:
    """Load a CSV dataset and prepare it for training.

    Args:
        data_path: Path to the CSV file.
        text_feature: Name of the column that contains the raw text.
        target_feature: Name of the column that contains the class labels.
    """

    def __init__(self, data_path: str, text_feature: str, target_feature: str):
        self.data_path = Path(data_path)
        self.text_feature = text_feature
        self.target_feature = target_feature

    def _read_data(self):
        """Read the CSV at ``self.data_path`` into a DataFrame."""
        return pd.read_csv(self.data_path)

    def encoder_class(self, df):
        """Label-encode the target column and persist the fitted encoder.

        The encoder is pickled to ``artifacts/encoder.pkl``.

        Returns:
            The encoded target values.
        """
        encoder = LabelEncoder()
        target = encoder.fit_transform(df[self.target_feature])
        os.makedirs("artifacts", exist_ok=True)
        save_path = os.path.join("artifacts", 'encoder.pkl')
        with open(save_path, 'wb') as f:
            pickle.dump(encoder, f)
        return target

    def clean_text(self, df):
        """Return the text column of ``df`` cleaned with ``TextCleaner``."""
        text_cleaner = TextCleaner()
        return df[self.text_feature].apply(text_cleaner.clean_text)

    def processing(self):
        """Load the data and add ``labeled_target`` and ``clean_text`` columns."""
        df = self._read_data()
        df['labeled_target'] = self.encoder_class(df)
        print("started Cleaning")
        df['clean_text'] = self.clean_text(df)
        return df

    @staticmethod
    def split_data(X, y, test_size=0.2, random_state=42, stratify=None):
        """Split features and labels into train and test sets.

        Args:
            X: Feature matrix.
            y: Labels.
            test_size: Share of the data held out for testing (default 0.2,
                the value that used to be hard-coded).
            random_state: Seed for the shuffle (default 42, as before).
            stratify: Optional labels to stratify on, e.g. ``y`` for
                imbalanced classes. ``None`` keeps the previous behaviour.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        from sklearn.model_selection import train_test_split
        return train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=stratify
        )
class Vectorization:
    """Fit, persist and re-use text vectorizers.

    Args:
        dataframe: DataFrame holding the training text. The placeholder
            default is only suitable for ``eval=True`` calls, which never
            read it.
        text_feature: Name of the column that contains the text to vectorize.
    """

    def __init__(self, dataframe=np.zeros((5,5)), text_feature='text_feature'):
        self.df = dataframe
        self.text = text_feature
        # Define the directory where you want to save the vectorizer
        self.vectorizer_dir = "vectorizers"

    def _fit_or_transform(self, vectorizer_cls, file_name, eval, string, kwargs):
        """Shared implementation of the public vectorizer methods.

        In eval mode the previously pickled vectorizer is loaded and applied
        to ``string``. Otherwise a new ``vectorizer_cls(**kwargs)`` is fitted
        on the text column, pickled under ``file_name`` and the resulting
        matrix is returned.
        """
        if eval:
            # Load from the same directory that fitting saves to.
            fitted = utils.load_artifacts(self.vectorizer_dir, file_name)
            return fitted.transform([string])

        # Only build a vectorizer when one is actually going to be fitted.
        vectorizer = vectorizer_cls(**kwargs)
        matrix = vectorizer.fit_transform(self.df[self.text])
        # The matrix exposes its shape directly; densifying it just to print
        # the shape can exhaust memory on large vocabularies.
        print(matrix.shape)
        os.makedirs(self.vectorizer_dir, exist_ok=True)
        save_path = os.path.join(self.vectorizer_dir, file_name)
        with open(save_path, 'wb') as f:
            pickle.dump(vectorizer, f)
        return matrix

    def TfidfVectorizer(self, eval=False, string="text", **kwargs):
        """Fit a TF-IDF vectorizer, or transform ``string`` with the saved one.

        Args:
            eval: When true, load ``tfidf_vectorizer.pkl`` and transform
                ``string`` instead of fitting.
            string: Text to transform in eval mode.
            **kwargs: Passed to ``sklearn``'s ``TfidfVectorizer`` when fitting.

        Returns:
            The matrix for the training text, or for ``[string]`` in eval mode.
        """
        return self._fit_or_transform(
            TfidfVectorizer, 'tfidf_vectorizer.pkl', eval, string, kwargs
        )

    def CountVectorizer(self, eval=False, string="text",**kwargs):
        """Fit a count vectorizer, or transform ``string`` with the saved one.

        Args:
            eval: When true, load ``count_vectorizer.pkl`` and transform
                ``string`` instead of fitting.
            string: Text to transform in eval mode.
            **kwargs: Passed to ``sklearn``'s ``CountVectorizer`` when fitting.

        Returns:
            The matrix for the training text, or for ``[string]`` in eval mode.
        """
        return self._fit_or_transform(
            CountVectorizer, 'count_vectorizer.pkl', eval, string, kwargs
        )

NoCodeTextClassifier/utils.py ADDED Viewed

	@@ -0,0 +1,39 @@

+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+import pickle
+import os
+import streamlit as st
def load_model(model_name):
    """Unpickle and return the model stored at ``models/<model_name>``.

    NOTE: ``pickle.load`` can execute arbitrary code; only load model files
    that this application wrote itself.
    """
    model_path = os.path.join('models', model_name)
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)
def load_artifacts(folder_name, file_name):
    """Unpickle and return the object stored at ``<folder_name>/<file_name>``.

    NOTE: ``pickle.load`` can execute arbitrary code; only load artifacts
    that this application wrote itself.
    """
    artifact_path = os.path.join(folder_name, file_name)
    with open(artifact_path, 'rb') as handle:
        return pickle.load(handle)
def prediction(model, X_test):
    """Load the pickled model named ``model`` and return its predictions.

    Args:
        model: File name of the pickled model, resolved by ``load_model``.
        X_test: Features passed to the model's ``predict`` method.
    """
    return load_model(model).predict(X_test)
def evaluation(model, X_test, y_test):
    """Evaluate a saved model and report the results.

    Accuracy, classification report and confusion matrix are printed to the
    console; accuracy and confusion matrix are also rendered in Streamlit
    (the classification report is console-only).

    Args:
        model: File name of the pickled model, e.g. ``"SVC.pkl"``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy score, so callers can compare models.
    """
    y_pred = prediction(model, X_test)
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Strip only the final extension so names containing dots stay intact
    # (split(".")[0] would cut "my.model.pkl" down to "my").
    model_name = os.path.splitext(os.path.basename(model))[0]

    print(f"Accuracy of {model_name}: {accuracy}\n")
    print(f"Classification Report of {model_name} : \n{class_report}\n")
    print(f"Confusion Matrix of {model_name} : \n{conf_matrix}")

    # Two decimals avoid float noise such as 93.33333333333334% in the UI.
    st.markdown(f"Accuracy of **{model_name}**: **{accuracy * 100:.2f}%**\n")
    st.markdown(f"\nConfusion Matrix of **{model_name}** : \n")
    st.write(conf_matrix)
    return accuracy