Taylor Kirk committed on
Commit 5d4981c · 0 Parent(s)

Fresh deployment after moving datasets to hf datahub

.gitignore ADDED
@@ -0,0 +1,212 @@
1
+ # Custom ignore
2
+ models/sgdc_pipeline.joblib
3
+ review_data/
4
+ models/demo_data.parquet
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[codz]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py.cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # UV
103
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
104
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
105
+ # commonly ignored for libraries.
106
+ #uv.lock
107
+
108
+ # poetry
109
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113
+ #poetry.lock
114
+ #poetry.toml
115
+
116
+ # pdm
117
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
118
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
119
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
120
+ #pdm.lock
121
+ #pdm.toml
122
+ .pdm-python
123
+ .pdm-build/
124
+
125
+ # pixi
126
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
127
+ #pixi.lock
128
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
129
+ # in the .venv directory. It is recommended not to include this directory in version control.
130
+ .pixi
131
+
132
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
133
+ __pypackages__/
134
+
135
+ # Celery stuff
136
+ celerybeat-schedule
137
+ celerybeat.pid
138
+
139
+ # SageMath parsed files
140
+ *.sage.py
141
+
142
+ # Environments
143
+ .env
144
+ .envrc
145
+ .venv
146
+ env/
147
+ venv/
148
+ ENV/
149
+ env.bak/
150
+ venv.bak/
151
+
152
+ # Spyder project settings
153
+ .spyderproject
154
+ .spyproject
155
+
156
+ # Rope project settings
157
+ .ropeproject
158
+
159
+ # mkdocs documentation
160
+ /site
161
+
162
+ # mypy
163
+ .mypy_cache/
164
+ .dmypy.json
165
+ dmypy.json
166
+
167
+ # Pyre type checker
168
+ .pyre/
169
+
170
+ # pytype static type analyzer
171
+ .pytype/
172
+
173
+ # Cython debug symbols
174
+ cython_debug/
175
+
176
+ # PyCharm
177
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
180
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
181
+ #.idea/
182
+
183
+ # Abstra
184
+ # Abstra is an AI-powered process automation framework.
185
+ # Ignore directories containing user credentials, local state, and settings.
186
+ # Learn more at https://abstra.io/docs
187
+ .abstra/
188
+
189
+ # Visual Studio Code
190
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
191
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
192
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
193
+ # you could uncomment the following to ignore the entire vscode folder
194
+ # .vscode/
195
+
196
+ # Ruff stuff:
197
+ .ruff_cache/
198
+
199
+ # PyPI configuration file
200
+ .pypirc
201
+
202
+ # Cursor
203
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
204
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
205
+ # refer to https://docs.cursor.com/context/ignore-files
206
+ .cursorignore
207
+ .cursorindexingignore
208
+
209
+ # Marimo
210
+ marimo/_static/
211
+ marimo/_lsp/
212
+ __marimo__/
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.12
Dockerfile ADDED
@@ -0,0 +1,27 @@
1
+ # 1. Match your local Python version
2
+ FROM python:3.12-slim
3
+
4
+ # 2. Install uv
5
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
6
+
7
+ WORKDIR /app
8
+
9
+ # 3. Install system dependencies
10
+ RUN apt-get update && apt-get install -y \
11
+ build-essential \
12
+ curl \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # 4. Copy and install dependencies
16
+ COPY pyproject.toml uv.lock ./
17
+ RUN uv pip install --system --no-cache -r pyproject.toml
18
+
19
+ # 5. Pre-download NLTK data (Matches your app.py list)
20
+ RUN python3 -m nltk.downloader stopwords wordnet omw-1.4 punkt_tab averaged_perceptron_tagger_eng
21
+
22
+ # 6. Copy the rest of the code
23
+ COPY . .
24
+
25
+ EXPOSE 7860
26
+
27
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ title: ADS 505 Review Analytics
3
+ emoji: 🌐
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ ---
app.py ADDED
@@ -0,0 +1,28 @@
1
+ import streamlit as st
2
+ import nltk
3
+
4
+ @st.cache_resource
5
+ def setup_nltk():
6
+ resources = [
7
+ ('corpora/stopwords', 'stopwords'),
8
+ ('corpora/wordnet', 'wordnet'),
9
+ ('corpora/omw-1.4', 'omw-1.4'),
10
+ ('tokenizers/punkt_tab', 'punkt_tab'),
11
+ ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng')
12
+ ]
13
+ for resource_path, package_name in resources:
14
+ try:
15
+ nltk.data.find(resource_path)
16
+ except LookupError:
17
+ nltk.download(package_name)
18
+
19
+ setup_nltk()
20
+
21
+ main_page = st.Page("pages/main.py", title="Home", icon="🏠")
22
+ topic_model = st.Page("pages/topic.py", title="Topic Modeling", icon="🌐")
23
+ modeling = st.Page("pages/model.py", title='Predictive Modeling', icon='🎱')
24
+ user = st.Page("pages/user.py", title='User Page', icon="👤")
25
+
26
+ pg = st.navigation({"Pages": [main_page, topic_model, modeling, user]})
27
+
28
+ pg.run()
pages/__init__.py ADDED
File without changes
pages/main.py ADDED
@@ -0,0 +1,41 @@
1
+ import streamlit as st
2
+ from tabs.main_page_tabs.dataset_overview import render as render_overview
3
+ from tabs.main_page_tabs.univariate_analysis import render as render_uni
4
+ from tabs.main_page_tabs.bivariate_analysis import render as render_bi
5
+ from tabs.main_page_tabs.target_variable import render as render_target
6
+ from tabs.main_page_tabs.home import render as render_home
7
+ from tabs.main_page_tabs.text_analysis import render as render_text
8
+
9
+ # Page Title
10
+ st.set_page_config(
11
+ page_title='Amazon Reviews',
12
+ layout='wide',
13
+ page_icon=":panda_face:"
14
+ )
15
+
16
+ st.title('Explore the wild world of Amazon Reviews')
17
+
18
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["About", 'Dataset Overview', 'Univariate Analysis', 'Bivariate Analysis', 'Target', 'Text Analysis'])
19
+
20
+ with tab1:
21
+ render_home()
22
+
23
+ with tab2:
24
+ render_overview()
25
+
26
+ if st.session_state.get("explore_df") is not None:
27
+ with tab3:
28
+ render_uni(st.session_state.explore_df)
29
+
30
+ with tab4:
31
+ render_bi(st.session_state.explore_df)
32
+
33
+ with tab5:
34
+ render_target(st.session_state.explore_df)
35
+
36
+ with tab6:
37
+ render_text(st.session_state.explore_df)
38
+
39
+
40
+
41
+
pages/model.py ADDED
@@ -0,0 +1,14 @@
1
+ import streamlit as st
2
+ from tabs.predictive_model_tabs.pred_model_one import render as render_about
3
+ from tabs.predictive_model_tabs.pred_model_two import render as render_demo
4
+
5
+ tab1, tab2 = st.tabs(['🤷‍♂️ About', ':star2: Demo'])
6
+
7
+ with tab1:
8
+ render_about()
9
+
10
+ with tab2:
11
+ render_demo()
12
+
13
+
14
+
pages/topic.py ADDED
@@ -0,0 +1,39 @@
1
+ import streamlit as st
2
+ from utils.topically import make_topics
3
+
4
+ st.set_page_config(layout="wide")
5
+
6
+ DATA_OPTIONS = {
7
+ 'Beauty': 'All_Beauty',
8
+ 'Appliances': 'Appliances',
9
+ 'Baby Products': 'Baby_Products',
10
+ 'Electronics': 'Electronics',
11
+ 'Health and Household': 'Health_and_Household',
12
+ 'Movies and TV': 'Movies_and_TV'
13
+ }
14
+
15
+ st.markdown("# Topic Modeling")
16
+
17
+ cat = st.sidebar.selectbox(
18
+ "Choose the dataset to model",
19
+ tuple(DATA_OPTIONS.keys()),
20
+ index=None
21
+ )
22
+ column = st.sidebar.selectbox("Choose a column to model", ("Text", "Title", "Both"), index=None)
23
+
24
+ if cat and column:
25
+ category = DATA_OPTIONS[cat]
26
+ topic_pipeline, fig = make_topics(
27
+ category=category,
28
+ topic_columns=column,
29
+ lemmatize=True, # or False
30
+ n1=2,
31
+ n2=3,
32
+ n_components=5,
33
+ rating=[1, 2], # optional
34
+ helpful_vote=0, # optional
35
+ new_words=None,
36
+ n_top_words=5,
37
+ # data_dir="path/to/review_data" # optional override if needed
38
+ )
39
+ st.plotly_chart(fig, use_container_width=True, config={"scrollZoom": True})
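
Note for reviewers: `utils.topically.make_topics` is not part of this hunk, so the call above is the only view of it here. As a rough orientation, a minimal NMF topic extraction of the kind the home page describes (TF-IDF of the chosen text column, NMF with `n_components` topics, top words per topic) is sketched below. Every name and default in this sketch is illustrative only and is not the project's actual implementation; the real function also filters by rating and helpful_vote and returns a Plotly figure.

```python
# Illustrative sketch only -- NOT the project's utils.topically.make_topics.
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

def sketch_topics(texts: pd.Series, n_components: int = 5,
                  n1: int = 2, n2: int = 3, n_top_words: int = 5):
    """Fit TF-IDF + NMF on one text column and return the top words per topic."""
    tfidf = TfidfVectorizer(stop_words="english", ngram_range=(n1, n2), max_features=20_000)
    doc_term = tfidf.fit_transform(texts.fillna(""))
    nmf = NMF(n_components=n_components, init="nndsvd", random_state=42)
    nmf.fit(doc_term)
    vocab = tfidf.get_feature_names_out()
    # Highest-weighted terms in each NMF component are the topic's top words
    return [[vocab[i] for i in topic.argsort()[::-1][:n_top_words]]
            for topic in nmf.components_]
```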
pages/user.py ADDED
@@ -0,0 +1,19 @@
1
+ import streamlit as st
2
+ from tabs import render_about
3
+ from tabs import render_topic
4
+ from tabs import render_pred
5
+ from tabs import render_analysis
6
+
7
+ tab1, tab2, tab3, tab4 = st.tabs(['🤷‍♂️ About', ':star2: Topic', '▶️ Predictive', "💬 Chat Analysis"])
8
+
9
+ with tab1:
10
+ render_about()
11
+
12
+ with tab2:
13
+ render_topic()
14
+
15
+ with tab3:
16
+ render_pred()
17
+
18
+ with tab4:
19
+ render_analysis()
pyproject.toml ADDED
@@ -0,0 +1,32 @@
1
+ [project]
2
+ name = "final-project"
3
+ version = "0.1.0"
4
+ description = "Analyzing customer review data from Amazon for final project for ADS505"
5
+ authors = [
6
+ {name = "Taylor Kirk",email = "tkirk@sandiego.edu"},
7
+ {name = "Sushama Kafle",email = "skafle@sandiego.edu"},
8
+ {name = "Luigi Salemi",email = "lsalemi@sandiego.edu"}
9
+ ]
10
+ license = {text = "Apache 2.0"}
11
+ readme = "README.md"
12
+ requires-python = ">=3.12"
13
+ dependencies = [
14
+ "requests (>=2.32.5,<3.0.0)",
15
+ "pandas (>=2.3.2,<3.0.0)",
16
+ "scipy (>=1.16.2,<2.0.0)",
17
+ "scikit-learn (>=1.7.2,<2.0.0)",
18
+ "matplotlib (>=3.10.6,<4.0.0)",
19
+ "plotly (>=6.3.0,<7.0.0)",
20
+ "ipykernel (>=6.30.1,<7.0.0)",
21
+ "nbformat (>=5.10.4,<6.0.0)",
22
+ "nltk",
23
+ "streamlit",
24
+ "seaborn",
25
+ "emoji",
26
+ "openai>=2.13.0",
27
+ "wordcloud>=1.9.4",
28
+ "textstat>=0.7.12",
29
+ "pathlib>=1.0.1",
30
+ "joblib>=1.5.2",
31
+ ]
32
+
tabs/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Tabs module for Streamlit EDA app
2
+
3
+ from .user_page_tabs import *
tabs/main_page_tabs/__init__.py ADDED
File without changes
tabs/main_page_tabs/bivariate_analysis.py ADDED
@@ -0,0 +1,139 @@
1
+ """
2
+ Bivariate Analysis Tab
3
+ This tab displays relationships between variables:
4
+ - Scatter plots (e.g., Price vs Average Rating)
5
+ - Correlation heatmap (triangular, using Plotly)
6
+ """
7
+
8
+ import streamlit as st
9
+ import pandas as pd
10
+ import numpy as np
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from utils.icons import lucide_icon
14
+
15
+
16
+ def render(df):
17
+ """
18
+ Render the Bivariate Analysis tab.
19
+
20
+ Args:
21
+ df (pd.DataFrame): The main dataset
22
+ """
23
+ st.markdown(
24
+ f'<h2 class="section-header icon-header">{lucide_icon("git-merge", size=28)} Bivariate Analysis</h2>',
25
+ unsafe_allow_html=True
26
+ )
27
+
28
+ with st.form('Bivariate Form'):
29
+ col1, col2, col3 = st.columns(3)
30
+
31
+ with col1:
32
+ st.subheader("Select First Column")
33
+ first_choice = st.selectbox(
34
+ "First Column",
35
+ options=df.columns,
36
+ key='first_col_select',
37
+ index=None,
38
+ placeholder='--Select Column--'
39
+ )
40
+
41
+ with col2:
42
+ st.subheader("Select Second Column")
43
+ second_choice = st.selectbox(
44
+ "Second Column",
45
+ options=df.columns,
46
+ key='second_col_select',
47
+ index=None,
48
+ placeholder='--Select Column--'
49
+ )
50
+
51
+ with col3:
52
+ st.subheader("Plot Choice")
53
+ plot_choice = st.selectbox(
54
+ 'Select Plot',
55
+ ['Scatter', 'Correlation'],
56
+ index=None,
57
+ placeholder="--Select Plot Type--"
58
+ )
59
+
60
+ submitted = st.form_submit_button("Plot Away")
61
+
62
+
63
+ # Scatter Plot
64
+ if plot_choice == 'Scatter':
65
+ scatter_fig = px.scatter(
66
+ df,
67
+ x=first_choice,
68
+ y=second_choice,
69
+ title=f'<b>Compare {first_choice} and {second_choice}</b>',
70
+ color_discrete_sequence=['darkgreen'],
71
+ opacity=0.6
72
+ )
73
+ st.plotly_chart(scatter_fig, use_container_width=True)
74
+
75
+ # Correlation Analysis
76
+ if plot_choice == 'Correlation':
77
+ st.markdown(
78
+ f"<h3>{lucide_icon('link', size=20)} Correlation Analysis<h3>",
79
+ unsafe_allow_html=True
80
+ )
81
+
82
+ corr_matrix = df.loc[:, [first_choice, second_choice]].corr()
83
+
84
+ # Create mask for upper triangle (including diagonal)
85
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
86
+
87
+ # Apply mask - set upper triangle to NaN
88
+ corr_masked = corr_matrix.mask(mask)
89
+
90
+ # Create custom text for annotations (only show values for lower triangle)
91
+ text_values = []
92
+ for i in range(len(corr_matrix)):
93
+ row_text = []
94
+ for j in range(len(corr_matrix)):
95
+ if i > j: # Lower triangle only
96
+ row_text.append(f"{corr_matrix.iloc[i, j]:.2f}")
97
+ else:
98
+ row_text.append("")
99
+ text_values.append(row_text)
100
+
101
+ # Create Plotly heatmap
102
+ fig_corr = go.Figure(data=go.Heatmap(
103
+ z=corr_masked.values,
104
+ x=corr_masked.columns,
105
+ y=corr_masked.index,
106
+ colorscale='RdBu_r',
107
+ zmid=0,
108
+ zmin=-1,
109
+ zmax=1,
110
+ text=text_values,
111
+ texttemplate='%{text}',
112
+ textfont={"size": 12},
113
+ colorbar=dict(
114
+ title=dict(text="Correlation", side="right"),
115
+ tickmode="linear",
116
+ tick0=-1,
117
+ dtick=0.2
118
+ ),
119
+ hoverongaps=False,
120
+ hovertemplate='%{y} vs %{x}<br>Correlation: %{z:.3f}<extra></extra>'
121
+ ))
122
+
123
+ fig_corr.update_layout(
124
+ title='<b>Correlation Matrix (Lower Triangle)</b>',
125
+ xaxis_title="",
126
+ yaxis_title="",
127
+ xaxis={'side': 'bottom'},
128
+ yaxis={'autorange': 'reversed'},
129
+ width=700,
130
+ height=600,
131
+ plot_bgcolor='rgba(0,0,0,0)',
132
+ paper_bgcolor='rgba(0,0,0,0)'
133
+ )
134
+
135
+ # Update axes to show all labels
136
+ fig_corr.update_xaxes(tickangle=45)
137
+
138
+ st.plotly_chart(fig_corr, width='stretch')
139
+
tabs/main_page_tabs/dataset_overview.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ Dataset Overview Tab
3
+ This tab displays basic information about the dataset including:
4
+ - First few rows of data
5
+ - Data types and missing values
6
+ - Summary statistics
7
+ """
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import os
12
+ from pathlib import Path
13
+ from utils.icons import lucide_icon
14
+ from utils.load_data import load_dataset
15
+
16
+ def find_data_dir():
17
+ current = Path(__file__).resolve()
18
+ for parent in current.parents:
19
+ if (parent / "review_data").exists():
20
+ return parent / "review_data"
21
+ return None
22
+
23
+ DATA_DIR = find_data_dir()
24
+
25
+ def render():
26
+ st.sidebar.header('Data')
27
+
28
+ # Convert Path object back to string for os.path functions if needed
29
+ if not os.path.isdir(str(DATA_DIR)):
30
+ st.sidebar.error(f"Missing folder. Currently looking at: {DATA_DIR}")
31
+ else:
32
+ files = sorted([f for f in os.listdir(DATA_DIR) if f.lower().endswith((".csv", ".parquet"))])
33
+ if files:
34
+ selected = st.sidebar.selectbox("Choose a dataset", ("-- Select a category --", 'Beauty', 'Appliances', 'Baby Products', 'Electronics', 'Health and Household', 'Movies and TV'))
35
+ if selected == "-- Select a category --":
36
+ st.stop()
37
+ else:
38
+ st.sidebar.write("You selected:", selected)
39
+ df = load_dataset(DATA_DIR, selected) #type: ignore
40
+ df.drop(columns=['images', 'asin', 'parent_asin', 'user_id'], inplace=True, errors='ignore')
41
+ else:
42
+ st.sidebar.warning("No .csv or .parquet files found in review_data/.")
43
+
44
+
45
+ st.markdown(
46
+ f'<h2 class="section-header icon-header">{lucide_icon("layout-dashboard", size=28)} Dataset Overview</h2>',
47
+ unsafe_allow_html=True
48
+ )
49
+
50
+ # Basic Information
51
+ st.markdown(
52
+ f'<h3>{lucide_icon("table", size=20)} Basic Information</h3>',
53
+ unsafe_allow_html=True
54
+ )
55
+ st.dataframe(df.head(), use_container_width=True)
56
+
57
+ # Data Types & Missing Values
58
+ st.markdown(
59
+ f"<h3>{lucide_icon('info', size=20)} Data Types & Missing Values<h3>",
60
+ unsafe_allow_html=True
61
+ )
62
+ info_df = pd.DataFrame({
63
+ 'Column': df.columns,
64
+ 'Data Type': df.dtypes.astype(str),
65
+ 'Non-Null Count': df.count(),
66
+ 'Missing Values': df.isnull().sum(),
67
+ 'Missing %': (df.isnull().sum() / len(df) * 100).round(2)
68
+ })
69
+ st.dataframe(info_df, use_container_width=True)
70
+
71
+ # Summary statistics
72
+ st.markdown(
73
+ f"<h3>{lucide_icon('calculator', size=20)} Summary Statistics<h3>",
74
+ unsafe_allow_html=True
75
+ )
76
+
77
+ st.dataframe(df.describe(), use_container_width=True)
78
+
79
+ st.session_state.explore_df = df
80
+
tabs/main_page_tabs/home.py ADDED
@@ -0,0 +1,103 @@
1
+ import streamlit as st
2
+
3
+ def render():
4
+ st.markdown("""
5
+ <style>
6
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
7
+ .custom-section {
8
+ font-family: 'Inter', sans-serif;
9
+ font-size: 16px;
10
+ line-height: 1.4;
11
+ }
12
+ .custom-section strong {
13
+ font-weight: 600;
14
+ color: #2E86AB;
15
+ display: inline-block;
16
+ margin-bottom: 8px;
17
+ }
18
+ </style>
19
+ """, unsafe_allow_html=True)
20
+
21
+ st.markdown("## :earth_asia: Navigation")
22
+
23
+ st.markdown("""
24
+ <div class="custom-section">
25
+ This home page outlines our project of using <a href="https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main">Amazon review data</a> to understand consumer behavior and help businesses improve their products and services. Select a dataset from the sidebar and use the tabs above to explore the data.
26
+ </div>
27
+ """, unsafe_allow_html=True)
28
+
29
+ st.markdown("\n")
30
+
31
+ st.info("""
32
+ **Available Tabs:**
33
+ - **Dataset Overview** — Basic information and summary statistics
34
+ - **Univariate Analysis** — Visualize individual columns with various plots
35
+ - **Bivariate Analysis** — Compare relationships between two columns
36
+ - **Target** — Analyze the target variable (helpful_votes)
37
+ - **Text Analysis** — Examine review titles and text content
38
+ """)
39
+
40
+ # New section for page navigation
41
+ st.markdown("### 📑 App Pages")
42
+ st.markdown("""
43
+ <div class="custom-section">
44
+ <strong>🏠 About (Current Page)</strong><br>
45
+ Learn about the project, explore our Amazon review datasets, and understand the data through various analyses.
46
+ <br><br>
47
+
48
+ <strong>🔍 Topic Modeling</strong><br>
49
+ Apply <a href="https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html">NMF (Non-negative Matrix Factorization)</a> topic modeling to discover themes in reviews. Choose a dataset and text column, then visualize the extracted topics and their distributions.
50
+ <br><br>
51
+
52
+ <strong>🤖 Predictive Modeling</strong><br>
53
+ Explore our machine learning model that predicts review helpfulness. See a live demo of how the model works and what features drive predictions.
54
+ <br><br>
55
+
56
+ <strong>👤 User Workspace</strong><br>
57
+ Your personal analysis environment where you can:
58
+ <ul>
59
+ <li>Upload your own customer review data</li>
60
+ <li>Explore and visualize</li>
61
+ <li>Apply topic modeling to your reviews</li>
62
+ <li>Use our predictive model on your data</li>
63
+ <li>Chat with an AI assistant to interpret results and gain insights</li>
64
+ </ul>
65
+ </div>
66
+ """, unsafe_allow_html=True)
67
+
68
+ st.divider()
69
+
70
+ st.markdown("## :blue[:material/description:] ADS505 Project Description")
71
+ st.markdown("""
72
+ <div class="custom-section">
73
+ This project analyzes Amazon customer review data to identify what makes a review helpful.
74
+ Using data science and machine learning, we build predictive models to surface the most helpful reviews and apply topic modeling to
75
+ uncover the key themes and characteristics of helpful reviews across product categories and ratings.
76
+ </div>
77
+ """, unsafe_allow_html=True)
78
+
79
+ st.divider()
80
+
81
+ st.markdown('## 🚩 Problem Statement')
82
+ st.markdown("""
83
+ <div class="custom-section">
84
+ <strong>What characteristics make an Amazon product review helpful?</strong><br>
85
+ By analyzing large-scale review datasets, this project identifies linguistic, structural, and contextual
86
+ features that correlate with helpfulness votes.
87
+ <br><br>
88
+
89
+ <div style="margin-top: 0px;">
90
+ <strong style="margin-bottom: 8px;">Our Goals:</strong>
91
+ <ol style="margin: 0; padding-left: 20px;">
92
+ <li><strong>Predictive Modeling</strong> — Develop models to identify reviews most likely to be found helpful</li>
93
+ <li><strong>Topic Analysis</strong> — Understand how helpful review characteristics differ by product category and rating</li>
94
+ </ol>
95
+ </div>
96
+
97
+ <div style="margin-top: 16px;">
98
+ <strong style="margin-bottom: 8px;">Impact:</strong>
99
+ Help users write better reviews, enable companies to highlight valuable feedback,
100
+ and provide insights for product and system improvements.
101
+ </div>
102
+ </div>
103
+ """, unsafe_allow_html=True)
tabs/main_page_tabs/target_variable.py ADDED
@@ -0,0 +1,281 @@
1
+ """
2
+ Target Variable Analysis Tab
3
+ This tab provides detailed analysis of the target variable (helpful_vote):
4
+ - Distribution visualizations (raw and log scale)
5
+ - Helpfulness categories (Not Helpful vs Helpful)
6
+ - Category statistics
7
+ - Advanced analysis (box plots, cumulative distribution)
8
+ - Detailed vote count distribution
9
+ """
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import numpy as np
14
+ import plotly.express as px
15
+ import plotly.graph_objects as go
16
+ from utils.icons import lucide_icon
17
+
18
+
19
+ def create_helpfulness_categories(df):
20
+ """
21
+ Create binary categories: helpful vs not helpful.
22
+
23
+ Args:
24
+ df (pd.DataFrame): The main dataset
25
+
26
+ Returns:
27
+ tuple: (df with new category column, bin_info dict)
28
+ """
29
+ helpful_votes = df['helpful_vote']
30
+
31
+ # Simple binary classification
32
+ # 0 votes = Not Helpful (no one found it helpful)
33
+ # 1+ votes = Helpful (at least one person found it helpful)
34
+ def categorize_helpfulness(votes):
35
+ if votes == 0:
36
+ return "Not Helpful (0 votes)"
37
+ else:
38
+ return "Helpful (1+ votes)"
39
+
40
+ df['helpfulness_category'] = helpful_votes.apply(categorize_helpfulness)
41
+
42
+ # Store bin info for display
43
+ bin_info = {
44
+ 'Not Helpful (0 votes)': "Reviews that received 0 helpful votes",
45
+ 'Helpful (1+ votes)': "Reviews that received 1 or more helpful votes"
46
+ }
47
+
48
+ return df, bin_info
49
+
50
+
51
+ def render(df):
52
+ """
53
+ Render the Target Variable Analysis tab.
54
+
55
+ Args:
56
+ df (pd.DataFrame): The main dataset
57
+ """
58
+ st.markdown(
59
+ f'<h2 class="section-header icon-header">{lucide_icon("target", size=28)} Target Variable Analysis</h2>',
60
+ unsafe_allow_html=True
61
+ )
62
+
63
+ st.info(f"""
64
+ **Classification Logic:**
65
+ - **Not Helpful (0 votes)**: Reviews that received 0 helpful votes - no one found them helpful
66
+ - **Helpful (1+ votes)**: Reviews that received 1 or more helpful votes - at least one person found them helpful
67
+ """)
68
+
69
+ st.markdown("\n")
70
+
71
+ if 'helpful_vote' in df.columns:
72
+ # Apply categorization
73
+ df_with_categories, bin_info = create_helpfulness_categories(df)
74
+
75
+ # Display bin information
76
+ st.markdown(
77
+ f"""
78
+ <h3 style="bottom-margin: -10px;"{lucide_icon('tags', size=20)} Helpfulness Categories
79
+ </h3>
80
+ """,
81
+ unsafe_allow_html=True
82
+ )
83
+
84
+
85
+ # Row 1: Distribution Analysis
86
+ col1, col2 = st.columns(2)
87
+
88
+ with col1:
89
+ # Original distribution
90
+ fig_target = px.histogram(
91
+ df,
92
+ x='helpful_vote',
93
+ title='<b>Distribution of Helpful Votes (Raw)</b>',
94
+ color_discrete_sequence=['#e74c3c'],
95
+ nbins=50
96
+ )
97
+ fig_target.update_layout(
98
+ xaxis_title="Number of Helpful Votes",
99
+ yaxis_title="Frequency"
100
+ )
101
+ st.plotly_chart(fig_target, use_container_width=True)
102
+
103
+ with col2:
104
+ # Log-scale distribution for better visualization
105
+ df_nonzero = df[df['helpful_vote'] > 0]
106
+ if len(df_nonzero) > 0:
107
+ fig_log = px.histogram(
108
+ df_nonzero,
109
+ x='helpful_vote',
110
+ title='<b>Distribution of Helpful Votes (Log Scale)</b>',
111
+ color_discrete_sequence=['#9b59b6'],
112
+ nbins=30,
113
+ log_y=True
114
+ )
115
+ fig_log.update_layout(
116
+ xaxis_title="Number of Helpful Votes",
117
+ yaxis_title="Frequency (Log Scale)"
118
+ )
119
+ st.plotly_chart(fig_log, use_container_width=True)
120
+ else:
121
+ st.markdown(
122
+ f"{lucide_icon('info', size=16)} No non-zero helpful votes to display in log scale",
123
+ unsafe_allow_html=True
124
+ )
125
+ st.markdown("\n")
126
+
127
+ # Row 2: Categorical Analysis
128
+ col1, col2 = st.columns([.6, .4], border=True)
129
+
130
+ with col1:
131
+ # Categorical distribution - using Luigi's bar chart style
132
+ category_counts = df_with_categories['helpfulness_category'].value_counts().reset_index()
133
+ category_counts.columns = ['Category', 'Count']
134
+
135
+ fig_categories = px.bar(
136
+ category_counts,
137
+ x='Count',
138
+ y='Category',
139
+ orientation='h',
140
+ title='<b>Distribution by Helpfulness Category</b>',
141
+ color_discrete_sequence=['#184A90'],
142
+ text='Count'
143
+ )
144
+ fig_categories.update_layout(
145
+ margin=dict(l=200, r=20, t=50, b=20),
146
+ yaxis={'categoryorder':'total ascending', 'title': ''},
147
+ xaxis_title='Count'
148
+ )
149
+ fig_categories.update_traces(texttemplate='%{text}', textposition='outside')
150
+ st.plotly_chart(fig_categories, use_container_width=True)
151
+
152
+ with col2:
153
+ # Category statistics
154
+ st.markdown(
155
+ f"""
156
+ <h3 style="margin-bottom: -15px;">{lucide_icon('target', size=20)} Category Statistics
157
+ </h3>
158
+ """,
159
+ unsafe_allow_html=True
160
+ )
161
+
162
+ category_stats = []
163
+ for category in ['Not Helpful (0 votes)', 'Helpful (1+ votes)']:
164
+ count = (df_with_categories['helpfulness_category'] == category).sum()
165
+ percentage = (count / len(df_with_categories)) * 100
166
+ category_stats.append({
167
+ 'Category': category,
168
+ 'Count': f"{count:,}",
169
+ 'Percentage': f"{percentage:.1f}%"
170
+ })
171
+
172
+ stats_df = pd.DataFrame(category_stats)
173
+ st.dataframe(stats_df, width='stretch', hide_index=True)
174
+
175
+ # Overall statistics
176
+ st.markdown(
177
+ f"""
178
+ <h3 style="margin-bottom: -15px;">{lucide_icon('trending-up', size=20)} Overall Statistics
179
+ </h3>
180
+ """,
181
+ unsafe_allow_html=True
182
+ )
183
+ overall_stats = {
184
+ 'Metric': ['Total Reviews', 'Mean Votes', 'Median Votes', 'Std Dev', 'Max Votes'],
185
+ 'Value': [
186
+ f"{df['helpful_vote'].count():,}",
187
+ f"{df['helpful_vote'].mean():.2f}",
188
+ f"{df['helpful_vote'].median():.2f}",
189
+ f"{df['helpful_vote'].std():.2f}",
190
+ f"{df['helpful_vote'].max():,}"
191
+ ]
192
+ }
193
+ overall_df = pd.DataFrame(overall_stats)
194
+ st.dataframe(overall_df, width='stretch', hide_index=True)
195
+
196
+ st.markdown("\n")
197
+
198
+ # Row 3: Advanced Analysis
199
+ st.markdown(
200
+ f"""
201
+ <h3 style="margin-bottom: -10px;>{lucide_icon('search', size=20)} Advanced Target Variable Analysis
202
+ </h3>
203
+ """,
204
+ unsafe_allow_html=True
205
+ )
206
+
207
+ col1, col2 = st.columns(2)
208
+
209
+ with col1:
210
+ # Box plot by category
211
+ fig_box = px.box(
212
+ df_with_categories,
213
+ x='helpfulness_category',
214
+ y='helpful_vote',
215
+ title='<b>Helpful Votes Distribution by Category</b>',
216
+ color_discrete_sequence=['#184A90']
217
+ )
218
+ fig_box.update_layout(
219
+ xaxis_title="Helpfulness Category",
220
+ yaxis_title="Number of Helpful Votes",
221
+ showlegend=False
222
+ )
223
+ st.plotly_chart(fig_box, use_container_width=True)
224
+
225
+ with col2:
226
+ # Cumulative distribution
227
+ sorted_votes = np.sort(df['helpful_vote'])
228
+ cumulative_pct = np.arange(1, len(sorted_votes) + 1) / len(sorted_votes) * 100
229
+
230
+ fig_cumulative = go.Figure()
231
+ fig_cumulative.add_trace(go.Scatter(
232
+ x=sorted_votes,
233
+ y=cumulative_pct,
234
+ mode='lines',
235
+ name='Cumulative %',
236
+ line=dict(color='#2c3e50', width=2)
237
+ ))
238
+
239
+ fig_cumulative.update_layout(
240
+ title='<b>Cumulative Distribution of Helpful Votes</b>',
241
+ xaxis_title='Number of Helpful Votes',
242
+ yaxis_title='Cumulative Percentage (%)',
243
+ showlegend=False
244
+ )
245
+ st.plotly_chart(fig_cumulative, use_container_width=True)
246
+
247
+ # Value counts for helpful votes (moved to bottom)
248
+ st.markdown("\n")
249
+
250
+ st.markdown(
251
+ f"""
252
+ <h3 style="margin-bottom: -10px;"{lucide_icon('hash', size=20)} Detailed Helpful Votes Distribution (Top 20)
253
+ </h3>
254
+ """,
255
+ unsafe_allow_html=True
256
+ )
257
+ # Filter out 0 votes and get top 20
258
+ value_counts = df[df['helpful_vote'] > 0]['helpful_vote'].value_counts().head(20).reset_index()
259
+ value_counts.columns = ['Helpful Votes', 'Count']
260
+
261
+ fig_counts = px.bar(
262
+ value_counts,
263
+ x='Helpful Votes',
264
+ y='Count',
265
+ title='<b>Top 20 Most Common Helpful Vote Counts (Excluding 0)</b>',
266
+ color_discrete_sequence=['#184A90'],
267
+ text='Count'
268
+ )
269
+ fig_counts.update_traces(texttemplate='%{text}', textposition='outside')
270
+ fig_counts.update_layout(
271
+ xaxis_title="Number of Helpful Votes",
272
+ yaxis_title="Frequency"
273
+ )
274
+ st.plotly_chart(fig_counts, use_container_width=True)
275
+
276
+ else:
277
+ st.warning(
278
+ f"{lucide_icon('alert-triangle', size=16)} Target variable 'helpful_vote' not found in the dataset.",
279
+ #unsafe_allow_html=True
280
+ )
281
+
tabs/main_page_tabs/text_analysis.py ADDED
@@ -0,0 +1,100 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from utils.icons import lucide_icon
7
+ from wordcloud import WordCloud
8
+ import textstat
9
+
10
+ @st.cache_data(show_spinner='Processing Text Analysis...', show_time=True)
11
+ def get_text_analysis_data(_df, text_col):
12
+ """
13
+ Caches both the wordcloud image and the text statistics.
14
+ The underscore '_' in _df prevents Streamlit from hashing the entire dataframe.
15
+ """
16
+ # Clean the text once
17
+ text_series = _df[text_col].dropna().astype(str)
18
+ full_text = " ".join(text_series)
19
+
20
+ # Generate WordCloud
21
+ wc = WordCloud(
22
+ width=600,
23
+ height=300,
24
+ background_color='white'
25
+ ).generate(full_text)
26
+
27
+ # Calculate textstat metrics (expensive operations)
28
+ stats = {
29
+ "avg_len": text_series.str.len().mean(),
30
+ "fk_grade": text_series.apply(textstat.flesch_kincaid_grade).mean(), # type: ignore
31
+ "s_count": text_series.apply(textstat.syllable_count).mean(), # type: ignore
32
+ "l_count": text_series.apply(textstat.lexicon_count).mean(), # type: ignore
33
+ "wc_image": wc.to_array()
34
+ }
35
+
36
+ return stats
37
+
38
+ def render(df):
39
+ st.markdown(
40
+ f'<h2 class="section-header icon-header">{lucide_icon("bar-chart-2", size=28)} Text Analysis</h2>',
41
+ unsafe_allow_html=True
42
+ )
43
+
44
+ text_col = st.selectbox(
45
+ "Select a text column",
46
+ options=df.columns,
47
+ index=None,
48
+ placeholder="--Select Text Column--"
49
+ )
50
+
51
+ if not text_col:
52
+ st.stop()
53
+
54
+ # Retrieve all cached results at once
55
+ analysis = get_text_analysis_data(df, text_col)
56
+
57
+ # Layout Metrics
58
+ col_a, col_b, col_c, col_d = st.columns(4)
59
+
60
+ col_a.metric(f'Avg {text_col.title()} Length', f"{analysis['avg_len']:.2f}")
61
+ col_b.metric('Avg Grade Level', f"{analysis['fk_grade']:.0f}")
62
+ col_c.metric('Avg Syllable Count', f"{analysis['s_count']:.2f}")
63
+ col_d.metric('Avg Word Count', f"{analysis['l_count']:.2f}")
64
+
65
+ st.markdown("\n")
66
+
67
+ # Display cached WordCloud image
68
+ st.image(analysis['wc_image'], width="content", caption="Image courtesy of you")
69
+
70
+ st.markdown("\n")
71
+
72
+ top_10 = df[text_col].value_counts().nlargest(10).reset_index()
73
+ top_10.columns = [text_col, 'count']
74
+
75
+ cat_fig = px.bar(
76
+ top_10,
77
+ x='count',
78
+ y=text_col,
79
+ orientation='h',
80
+ title=f'<b>Top 10 Categories for {text_col.title()}</b>',
81
+ color_discrete_sequence=['#184A90'],
82
+ text='count'
83
+ )
84
+
85
+ cat_fig.update_layout(
86
+ margin=dict(l=100, r=20, t=50, b=20),
87
+ yaxis={'categoryorder':'total ascending', 'title': ''},
88
+ xaxis_title='Count',
89
+ autosize=True
90
+ )
91
+
92
+ cat_fig.update_traces(texttemplate='%{text}', textposition='outside')
93
+
94
+ # The "Modern" way to call it:
95
+ st.plotly_chart(
96
+ cat_fig,
97
+ use_container_width=True,
98
+ config={'displayModeBar': False} # Example of using the 'config' dict
99
+ )
100
+
tabs/main_page_tabs/univariate_analysis.py ADDED
@@ -0,0 +1,134 @@
1
+ """
2
+ Univariate Analysis Tab
3
+ This tab displays distributions of individual variables:
4
+ - Numerical variables: Histograms in a grid layout
5
+ - Categorical variables: Bar charts for each category
6
+ """
7
+
8
+ import streamlit as st
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from utils.icons import lucide_icon
12
+
13
+
14
+ def render(df):
15
+ """
16
+ Render the Univariate Analysis tab.
17
+
18
+ Args:
19
+ df (pd.DataFrame): The main dataset
20
+ """
21
+ st.markdown(
22
+ f'<h2 class="section-header icon-header">{lucide_icon("bar-chart-2", size=28)} Univariate Analysis</h2>',
23
+ unsafe_allow_html=True
24
+ )
25
+
26
+ with st.form('Univariate Form'):
27
+
28
+ col1, col2 = st.columns(2)
29
+
30
+ col1.subheader('Select Column')
31
+ explore_column = col1.selectbox(
32
+ "Choose a column",
33
+ list(df.columns),
34
+ index=None,
35
+ key='explore_column',
36
+ placeholder="-- Select to choose --"
37
+ )
38
+
39
+ col2.subheader('Select Plot Type')
40
+ column_plot = col2.selectbox(
41
+ "Choose a plot type",
42
+ ['Histogram', "Bar", "Box"],
43
+ index=None,
44
+ placeholder="-- Select to choose --"
45
+ )
46
+
47
+ submitted = st.form_submit_button('Plot Away')
48
+
49
+ # Histogram
50
+ if column_plot == "Histogram":
51
+
52
+ fig_hist = go.Figure()
53
+
54
+ upper_limit = df[explore_column].quantile(0.99)
55
+ df_filtered = df[df[explore_column] <= upper_limit]
56
+ num_outliers = (df[explore_column] > upper_limit).sum()
57
+ outlier_percent = (num_outliers / len(df)) * 100
58
+
59
+ fig_hist.add_trace(
60
+ go.Histogram(
61
+ x=df_filtered[explore_column],
62
+ nbinsx=30,
63
+ name=explore_column,
64
+ showlegend=False,
65
+ marker = dict(
66
+ color='#184A90',
67
+ line=dict(color='white')
68
+ )
69
+ ),
70
+ )
71
+
72
+ fig_hist.add_annotation(
73
+ xref="paper", yref="paper", # position relative to the figure
74
+ x=0.98, y=1.05,
75
+ text="ℹ️ Outlier Info",
76
+ showarrow=False,
77
+ font=dict(size=12, color="gray"),
78
+ hovertext="Outliers are filtered above the 99th percentile.<br><br>"f"Threshold: {upper_limit:.2f}<br>"f"Number: {num_outliers}<br>"f"Percent of Data: {outlier_percent:.2f}%",
79
+ hoverlabel=dict(bgcolor="white"),
80
+ )
81
+
82
+ fig_hist.update_layout(
83
+ title_text=f"<b>Distributions of {explore_column}</b>",
84
+ height=400,
85
+ showlegend=False
86
+ )
87
+ st.plotly_chart(fig_hist, use_container_width=True)
88
+
89
+ # Bar chart
90
+ if column_plot == "Bar":
91
+
92
+ if df[explore_column].nunique() > 10:
93
+ # Show top 10 categories for variables with many categories
94
+ top_10 = df[explore_column].value_counts().nlargest(10).reset_index()
95
+ top_10.columns = [explore_column, 'count']
96
+
97
+ cat_fig = px.bar(
98
+ top_10,
99
+ x='count',
100
+ y=explore_column,
101
+ orientation='h',
102
+ title=f'<b>Top 10 Categories for {explore_column}</b>',
103
+ color_discrete_sequence=['#184A90'],
104
+ text='count'
105
+ )
106
+ cat_fig.update_layout(
107
+ margin=dict(l=250, r=20, t=50, b=20),
108
+ yaxis={'categoryorder':'total ascending', 'title': ''},
109
+ xaxis_title='Count'
110
+ )
111
+ cat_fig.update_traces(texttemplate='%{text}', textposition='outside')
112
+ else:
113
+ # Regular histogram for variables with few categories
114
+ cat_fig = px.histogram(
115
+ df,
116
+ x=explore_column,
117
+ title=f'<b>Distribution of {explore_column}</b>',
118
+ color_discrete_sequence=['#184A90']
119
+ )
120
+ cat_fig.update_xaxes(categoryorder="total descending")
121
+
122
+ st.plotly_chart(cat_fig, use_container_width=True)
123
+
124
+ # Box Plot
125
+ if column_plot == "Box":
126
+ box_fig = px.box(
127
+ df,
128
+ x=explore_column,
129
+ orientation='v',
130
+ title=f'<b>Box plot for {explore_column}</b>'
131
+ )
132
+
133
+ st.plotly_chart(box_fig, use_container_width=True)
134
+
tabs/predictive_model_tabs/__init__.py ADDED
File without changes
tabs/predictive_model_tabs/pred_model_one.py ADDED
@@ -0,0 +1,244 @@
1
+ import streamlit as st
2
+
3
+ # include expected schema for uploaded data
4
+
5
+ def render():
6
+
7
+ st.markdown("""
8
+ <style>
9
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
10
+ .custom-section {
11
+ font-family: 'Inter', sans-serif;
12
+ font-size: 16px;
13
+ line-height: 1.4;
14
+ }
15
+ .custom-section strong {
16
+ font-weight: 600;
17
+ color: #2E86AB;
18
+ display: inline-block;
19
+ margin-bottom: 8px;
20
+ }
21
+ .custom-section ol,
22
+ .custom-section ul {
23
+ margin-top: 0 !important;
24
+ margin-bottom: 10px !important;
25
+ padding-top: 0 !important;
26
+ }
27
+ </style>
28
+ """, unsafe_allow_html=True)
29
+
30
+ st.markdown("## 🤖 About Predictive Modeling")
31
+
32
+ st.markdown("""
33
+ <div class="custom-section">
34
+ <strong>Project Goal</strong><br>
35
+ Our objective was to build a model that predicts which customer reviews will be found helpful by others. By identifying helpful negative reviews, we can surface potential product or service issues worth investigating. Similarly, elevating helpful positive reviews highlights what customers value most.
36
+ <br><br>
37
+
38
+ <strong>Secondary Benefit</strong><br>
39
+ Understanding the characteristics of helpful reviews enables reviewers to improve the quality of their feedback, making it more valuable for both businesses and consumers.
40
+
41
+ <strong>What You'll Find Below</strong>
42
+ <ul>
43
+ <li><strong>Data Schema</strong> — Required format and fields needed to run predictions</li>
44
+ <li><strong>Modeling Process</strong> — Step-by-step explanation of how the model works</li>
45
+ <li><strong>Interactive Demo (Tab 2)</strong> — Hands-on walkthrough before applying the model to your own data on the User Page</li>
46
+ </ul>
47
+ </div>
48
+ """, unsafe_allow_html=True)
49
+
50
+ #st.divider()
51
+ st.markdown("""
52
+ <hr style='
53
+ border: none;
54
+ height: 2px;
55
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
56
+ margin: 20px 0;
57
+ '>
58
+ """, unsafe_allow_html=True)
59
+
60
+ st.markdown("## Data Schema")
61
+
62
+ st.markdown("""
63
+
64
+ | Column Name | Data Type | Description |
65
+ |------------|-----------|-------------|
66
+ | `lemma_title` | string | Lemmatized version of the review title |
67
+ | `lemma_text` | string | Lemmatized version of the review text |
68
+ | `images` | boolean | Binary indicator if the review includes an image or not |
69
+ | `Review Length` | integer | Character count of the review text |
70
+ | `Title Length` | integer | Character count of the review title |
71
+
72
+ *Read more about lemmatization and the process used in our models [here](https://www.geeksforgeeks.org/python/python-lemmatization-with-nltk/)*
73
+ """)
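For readers who want to build this schema from raw reviews, the sketch below shows one plausible way to produce the lemmatized and length columns with NLTK and pandas. It is illustrative only — the tokenization, sample values, and helper name are assumptions, not the app's actual preprocessing code.

```python
# A minimal, hedged sketch of producing the schema columns above (not the app's code).
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time NLTK downloads may be needed: nltk.download('punkt'); nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize(text: str) -> str:
    """Lowercase, tokenize, and lemmatize a piece of review text."""
    return " ".join(lemmatizer.lemmatize(tok) for tok in word_tokenize(text.lower()))

df = pd.DataFrame({
    "review_title": ["Works great"],
    "text": ["The batteries lasted for weeks and charged quickly."],
    "images": [False],
})
df["lemma_title"] = df["review_title"].apply(lemmatize)
df["lemma_text"] = df["text"].apply(lemmatize)
df["Review Length"] = df["text"].str.len()
df["Title Length"] = df["review_title"].str.len()
```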
74
+
75
+ st.markdown("""
76
+ <hr style='
77
+ border: none;
78
+ height: 2px;
79
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
80
+ margin: 20px 0;
81
+ '>
82
+ """, unsafe_allow_html=True)
83
+
84
+ st.markdown("## Model Components")
85
+
86
+ st.markdown("""
87
+ <div class="custom-section">
88
+ Our model uses a <strong>four-stage pipeline</strong> to predict review helpfulness. We trained on 60,000+ reviews
89
+ and achieved <strong>71.7% accuracy</strong> and <strong>63.6% F1-macro score</strong>. The model outputs probability scores,
90
+ allowing you to rank and prioritize reviews that are most likely to be found helpful by customers.
91
+ </div><br>
92
+ """, unsafe_allow_html=True)
93
+
94
+ st.markdown("### The Pipeline")
95
+
96
+ st.markdown("""
97
+ <div class="custom-section">
98
+
99
+ <strong>1. TF-IDF Vectorization</strong> — Extracting meaningful text patterns<br>
100
+ We transform `lemma_title` and `lemma_text` into numerical features using Term Frequency-Inverse Document Frequency (TF-IDF).
101
+ This approach identifies words and phrases that distinguish helpful reviews from unhelpful ones by balancing how often a term
102
+ appears in a specific review against how common it is across all reviews. Words that appear frequently in helpful reviews but
103
+ rarely elsewhere receive higher weights, making them strong predictive signals.
104
+ <br>
105
+
106
+ <strong>Why TF-IDF?</strong> It automatically downweights generic words while highlighting distinctive language patterns.
107
+ We use 1-2 word phrases (unigrams and bigrams) to capture meaningful combinations like "works great" or "poor quality."
108
+ <br><br>
109
+
110
+ <strong>2. Standard Scaler</strong> — Normalizing review metrics<br>
111
+ Review length and title length are scaled to have mean=0 and standard deviation=1. This prevents longer reviews from
112
+ dominating the model simply due to scale differences.
113
+ <br>
114
+
115
+ <strong>Known limitation:</strong> We discovered that helpfulness has a non-linear relationship with length. Very short and
116
+ very long reviews both tend to receive fewer helpful votes, with medium-length reviews performing best. Our linear scaling
117
+ doesn't fully capture this relationship, suggesting polynomial features or binning could improve future iterations.
118
+ <br><br>
119
+
120
+ <strong>3. Truncated SVD</strong> — Dimensionality reduction for efficiency<br>
121
+ After TF-IDF, our feature space explodes to 200,000+ dimensions (one for each unique word/phrase). We use Truncated Singular
122
+ Value Decomposition to compress this down to <strong>800 components</strong> while retaining <strong>70% of the variance</strong>.
123
+ This dramatically speeds up training while maintaining predictive power.
124
+ <br>
125
+
126
+ <strong>Why Truncated SVD over PCA?</strong> It works directly with sparse matrices (TF-IDF produces mostly zeros), making it
127
+ far more memory-efficient. We tuned the component count by balancing F1-macro score against model complexity.
128
+ <br><br>
129
+
130
+ <strong>4. Stochastic Gradient Descent Classifier (SGDC)</strong> — The final predictor<br>
131
+ We compared five models: Decision Trees, K-Nearest Neighbors, Linear SVM, XGBoost, and SGDC. <strong>SGDC emerged as the best
132
+ overall performer,</strong> narrowly beating XGBoost on the gains curve (a metric measuring how well the model prioritizes truly
133
+ helpful reviews at the top of its predictions).
134
+ <br><br>
135
+
136
+ <strong>Key tuning decisions:</strong>
137
+ <ul>
138
+ <li><strong>class_weight='balanced'</strong>: Our data is imbalanced (80% of reviews have zero helpful votes), so we weighted
139
+ the minority class to prevent the model from simply predicting "not helpful" for everything</li>
140
+ <li><strong>loss='modified_huber'</strong>: Provides probability estimates (needed for ranking) while being robust to outliers</li>
141
+ <li><strong>early_stopping=True</strong>: Prevents overfitting by monitoring validation performance</li>
142
+ </ul>
143
+
144
+ <strong>Why SGDC over XGBoost?</strong> While XGBoost had slightly better raw accuracy (72% vs 71.7%), SGDC showed better
145
+ generalization, faster training, and superior performance on the gains curve, meaning it does a better job surfacing the
146
+ <em>most</em> helpful reviews, which is what matters for practical use.
147
+ </div>
148
+ """, unsafe_allow_html=True)
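To make the four stages concrete, here is a minimal sketch of how such a pipeline could be assembled with scikit-learn. It assumes the schema columns listed earlier (`lemma_title`, `lemma_text`, `images`, `Review Length`, `Title Length`) and uses illustrative hyperparameters; it is not the exact saved pipeline shipped with the app.

```python
# A hedged sketch of the described TF-IDF -> scaling -> TruncatedSVD -> SGDC pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer([
    ("tfidf_title", TfidfVectorizer(ngram_range=(1, 2), stop_words="english"), "lemma_title"),
    ("tfidf_text", TfidfVectorizer(ngram_range=(1, 2), stop_words="english"), "lemma_text"),
    ("scale", StandardScaler(), ["Review Length", "Title Length"]),
    ("images", "passthrough", ["images"]),
])

pipeline = Pipeline([
    ("features", Pipeline([
        ("columns", preprocessor),
        ("svd", TruncatedSVD(n_components=800, random_state=0)),  # compresses the sparse TF-IDF space
    ])),
    ("clf", SGDClassifier(
        loss="modified_huber",    # enables predict_proba for ranking reviews
        class_weight="balanced",  # offsets the ~80% "not helpful" majority
        early_stopping=True,
    )),
])

# Usage (train_df is assumed to exist with the columns above and a binary 'vote' target):
# pipeline.fit(train_df[["lemma_title", "lemma_text", "Review Length", "Title Length", "images"]],
#              train_df["vote"])
# probs = pipeline.predict_proba(new_df[pipeline.feature_names_in_])[:, 1]
```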
149
+
150
+ st.markdown("### Model Performance & Insights")
151
+
152
+ st.markdown("""
153
+ <div class="custom-section">
154
+ <strong>What makes a review helpful?</strong> Our analysis revealed three key patterns:
155
+ <ul>
156
+ <li><strong>Including an image</strong> significantly increases helpfulness</li>
157
+ <li><strong>Medium-length reviews</strong> (not too short, not too long) perform best</li>
158
+ <li><strong>Specific vocabulary</strong> varies by product category — suggesting category-specific models could further improve accuracy</li>
159
+ </ul>
160
+
161
+ <strong>Practical application:</strong> The model outputs probability scores (0-1) that allow you to rank your reviews.
162
+ Focus on high-probability <strong>negative</strong> reviews to identify product issues early, and elevate high-probability
163
+ <strong>positive</strong> reviews to guide purchasing decisions.
164
+ </div>
165
+ """, unsafe_allow_html=True)
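As a rough illustration of the "practical application" above, the helper below ranks reviews by predicted probability and flags likely-helpful negative reviews. The column names (`rating`) and threshold are assumptions for the sketch, not part of the app.

```python
# Illustrative only: turn predicted probabilities into a priority reading list.
import pandas as pd

def priority_reviews(df: pd.DataFrame, probs, top_n: int = 50) -> pd.DataFrame:
    """Rank reviews by predicted helpfulness; flag likely-helpful negative reviews."""
    ranked = df.assign(helpful_prob=probs).sort_values("helpful_prob", ascending=False)
    # Assumed convention: a 1-5 star 'rating' column; <=2 stars treated as negative.
    ranked["likely_helpful_negative"] = (ranked["helpful_prob"] > 0.5) & (ranked["rating"] <= 2)
    return ranked.head(top_n)
```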
166
+
167
+ st.markdown("### Limitations & Future Improvements")
168
+
169
+ st.markdown("""
170
+ <div class="custom-section">
171
+ <strong>Current limitations to be aware of:</strong>
172
+ <ul>
173
+ <li><strong>Category-agnostic training</strong> — The model was trained across all product categories. Category-specific models
174
+ would likely improve accuracy since "helpful" looks different for electronics vs. beauty products</li>
175
+ <li><strong>Low helpfulness threshold</strong> — We defined "helpful" as 1+ votes due to computational constraints. A higher
176
+ threshold (e.g., 5+ votes) would be more meaningful but requires training on larger datasets</li>
177
+ <li><strong>Non-linear length relationships</strong> — As mentioned above, polynomial features could better capture the
178
+ sweet spot for review length</li>
179
+ </ul>
180
+
181
+ <strong>What we'd do with more resources:</strong> Train separate models per category, use a higher helpfulness threshold,
182
+ experiment with transformer-based models (BERT, etc.), and incorporate temporal features (how quickly reviews receive votes).
183
+ </div>
184
+ """, unsafe_allow_html=True)
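One hedged idea for the non-linear length limitation noted above is to replace raw character counts with coarse bins; the cut points below are made up for illustration, not tuned values.

```python
# Sketch: bin review length so the model can learn a "medium length is best" pattern.
import pandas as pd

def bin_review_length(lengths: pd.Series) -> pd.Series:
    """Map raw character counts to coarse, illustrative bins."""
    return pd.cut(
        lengths,
        bins=[0, 100, 500, 2000, float("inf")],
        labels=["very_short", "short", "medium", "long"],
    )
```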
185
+
186
+ st.markdown("""
187
+ Below you'll find the specific hyperparameters tuned using [Optuna](https://optuna.readthedocs.io/en/stable/index.html),
188
+ an automated hyperparameter optimization framework. Click each section to see the final parameter values and learn more
189
+ about the methods used.
190
+ """)
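For context, an Optuna study for a classifier like this typically looks like the sketch below. The search ranges, dummy data, and trial count are illustrative assumptions, not the settings used for this model.

```python
# A minimal, hedged Optuna tuning sketch for an SGDClassifier.
import optuna
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Stand-in data so the sketch runs; in practice this would be the SVD-reduced features.
X_train, y_train = make_classification(n_samples=500, n_features=20, weights=[0.8], random_state=0)

def objective(trial):
    clf = SGDClassifier(
        loss="modified_huber",
        penalty="elasticnet",
        alpha=trial.suggest_float("alpha", 1e-5, 1e-2, log=True),
        l1_ratio=trial.suggest_float("l1_ratio", 0.0, 1.0),
        class_weight="balanced",
        early_stopping=True,
    )
    return cross_val_score(clf, X_train, y_train, scoring="f1_macro", cv=3).mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params)
```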
191
+
192
+ pre, pred = st.columns(2)
193
+
194
+ with pre.expander("**Preprocessing Steps**"):
195
+ col1, col2, col3, col4 = st.columns(4)
196
+
197
+ with col1.popover("TF-IDF Title"):
198
+ st.write("Learn more about tf-idf [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)")
199
+ st.code("""
200
+ {'max_df': 0.95,
201
+ 'min_df': 1,
202
+ 'ngram_range': (1, 2),
203
+ 'stop_words': 'english',
204
+ 'sublinear_tf': True}
205
+ """)
206
+
207
+ with col2.popover("TF-IDF Text"):
208
+ st.write("""The scikit-learn built-in English stop words argument was used here for convenience; however, there are [known issues](https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words),
209
+ so a future iteration of this project might find improvement in using a more robust selection of stop words and including ones that
210
+ are custom to the specific domain being modeled.""")
211
+ st.code("""
212
+ {'max_df': 0.9,
213
+ 'min_df': 2,
214
+ 'ngram_range': (1, 2),
215
+ 'stop_words': 'english',
216
+ 'sublinear_tf': True}
217
+ """)
218
+
219
+ with col3.popover("Standard Scaler"):
220
+ st.write("Default settings for [Standard Scaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) were used to scale review length and title length")
221
+
222
+ with col4.popover("Truncated SVD"):
223
+ st.write("The only parameter changed was `n_components`. Value used was 800. [Truncated SVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html)")
224
+
225
+ with pred.expander("**Predictive Model**"):
226
+ st.write("Model used was Stochastic Gradient Descent Classifier [(SGDC)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)")
227
+ st.code("""
228
+ {'alpha': 0.0002,
229
+ 'class_weight': 'balanced',
230
+ 'early_stopping': True,
231
+ 'eta0': 0.001,
232
+ 'l1_ratio': 0.9,
233
+ 'learning_rate': 'adaptive',
234
+ 'loss': 'modified_huber',
235
+ 'max_iter': 500,
236
+ 'n_iter_no_change': 8,
237
+ 'penalty': 'elasticnet',
238
+ 'validation_fraction': 0.15}
239
+ """)
240
+ st.write("The most important parameters were the loss function, `class_weight`, and `early_stopping`. Every other parameter tuned led to only marginal improvements.")
241
+
242
+
243
+
244
+
tabs/predictive_model_tabs/pred_model_two.py ADDED
@@ -0,0 +1,44 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from utils.load_data import load_dataset
4
+ from utils.plot_gains import plot_gains
5
+ from utils.load_pred_model import load_model, load_demo_data
6
+
7
+ def render():
8
+
9
+ model = load_model() # Using the cached function
10
+ df = load_demo_data()
11
+ model_columns = ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
12
+ X = df[model_columns]
13
+ y = df['vote']
14
+ st.session_state.demo_probs = model.predict_proba(X)
15
+
16
+ with st.expander("Step 1: Load Data and review", icon="🧐"):
17
+
18
+ st.write("You'll first upload your dataset (in CSV or Parquet format) and review it to make sure everything looks right. Below is what the final dataset looks like. We'll take care of the preprocessing steps; the only columns you need to ensure exist in the uploaded data are the review title and text columns, images, and the number of votes each review has so far if using existing data.")
19
+ st.dataframe(df)
20
+
21
+ with st.expander("Step 2: Get Predictions", icon=':material/self_improvement:'):
22
+
23
+ st.write("The next thing we'll do is use the model to make predictions on your data. " \
24
+ "For our purposes, we are predicting the probability that the review belongs to the positive class")
25
+
26
+ prob_button = st.button("Push to predict", icon='🎆', type='secondary')
27
+
28
+ if prob_button:
29
+ prob_df = pd.DataFrame({
30
+ "Actual": y,
31
+ "Probability of helpful vote": st.session_state.demo_probs[:, 1]
32
+ })
33
+ st.dataframe(prob_df)
34
+
35
+ with st.expander("Step 3: Plots the gains", icon=":material/data_thresholding:"):
36
+
37
+ st.write("Once we have our predictions, we can plot the gains curve, which shows us which subset of our data is worth focusing on")
38
+
39
+ if st.button("Plot the gains", icon="🤪", type="secondary"):
40
+ fig, data, total = plot_gains(y, st.session_state.demo_probs[:, 1])
41
+ st.plotly_chart(fig)
42
+
43
+ st.write(f"""We can see from this plot that our best return comes from focusing on the top **{round(total*100, 2)}%** of our reviews,
44
+ which will lead to us capturing **{round(data*100, 2)}%** of all possible cases""")
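For readers curious what `plot_gains` is doing conceptually, a cumulative gains curve can be computed roughly as in the sketch below. This is an assumed stand-in, not the actual `utils.plot_gains` implementation.

```python
# Hedged sketch of computing a cumulative gains curve from labels and probabilities.
import numpy as np

def gains_curve(y_true, probs):
    """Return (fraction of data reviewed, fraction of positives captured), ranked by probability."""
    order = np.argsort(probs)[::-1]                      # highest predicted probability first
    captured = np.cumsum(np.asarray(y_true)[order] > 0)  # running count of truly helpful reviews
    frac_data = np.arange(1, len(order) + 1) / len(order)
    frac_captured = captured / max(captured[-1], 1)
    return frac_data, frac_captured
```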
tabs/user_page_tabs/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .user_about import render as render_about
2
+ from .user_chat_analysis import render as render_analysis
3
+ from .user_pred_model import render as render_pred
4
+ from .user_topic_model import render as render_topic
5
+
6
+ __all__ = ['render_about', 'render_analysis', 'render_pred', 'render_topic']
tabs/user_page_tabs/user_about.py ADDED
@@ -0,0 +1,223 @@
1
+ import streamlit as st
2
+
3
+ def render():
4
+ st.markdown("""
5
+ <style>
6
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
7
+ .custom-section {
8
+ font-family: 'Inter', sans-serif;
9
+ font-size: 16px;
10
+ line-height: 1.4;
11
+ }
12
+ .custom-section strong {
13
+ font-weight: 600;
14
+ color: #2E86AB;
15
+ display: inline-block;
16
+ margin-bottom: 8px;
17
+ }
18
+ .custom-section ol,
19
+ .custom-section ul {
20
+ margin-top: 0 !important;
21
+ margin-bottom: 10px !important;
22
+ padding-top: 0 !important;
23
+ }
24
+ </style>
25
+ """, unsafe_allow_html=True)
26
+
27
+ st.markdown("## 👤 Welcome to Your Workspace")
28
+
29
+ st.markdown("""
30
+ <div class="custom-section">
31
+ This is your personal analysis environment where you can upload your own customer review data, discover topics
32
+ within your reviews, predict which reviews will be most helpful, and chat with an AI assistant to interpret
33
+ your results.
34
+ </div>
35
+ """, unsafe_allow_html=True)
36
+
37
+ st.markdown("""
38
+ <hr style='
39
+ border: none;
40
+ height: 2px;
41
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
42
+ margin: 20px 0;
43
+ '>
44
+ """, unsafe_allow_html=True)
45
+
46
+ st.markdown("## 📊 Required Data Format")
47
+
48
+ st.markdown("""
49
+ <div class="custom-section">
50
+ Your dataset must include the following columns to use all features of this workspace.
51
+ </div><br>
52
+ """, unsafe_allow_html=True)
53
+
54
+ st.markdown("""
55
+ | Column Name | Data Type | Description | Required For |
56
+ |------------|-----------|-------------|--------------|
57
+ | `review_title` or `title` | string | The title of the customer review | Topic Modeling, Predictive Model |
58
+ | `text` | string | The full text content of the review | Topic Modeling, Predictive Model |
59
+ | `images` | boolean/integer | Binary indicator (0/1 or True/False) if the review includes images | Predictive Model |
60
+ | `helpful_vote` | integer | Number of helpful votes the review received | Predictive Model (target variable) |
61
+ | `rating` | integer (1-5) | Star rating given by the reviewer | Optional filtering |
62
+
63
+ **Important notes:**
64
+ - Text columns can contain raw review text (HTML tags will be automatically removed)
65
+ - For best results, include at least 1,000+ reviews
66
+ - The `helpful_vote` column is used as the target variable (0 = not helpful, 1+ = helpful)
67
+ """)
68
+
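As a hedged example of what the notes above imply, a quick pre-upload sanity check could look like the sketch below. The required column names come from the table; the helper itself and the `target` column are illustrative, not part of the app.

```python
# Illustrative upload check: verify required columns and binarize the helpfulness target.
import pandas as pd

REQUIRED = {"review_title", "text", "images", "helpful_vote"}

def check_upload(df: pd.DataFrame) -> pd.DataFrame:
    missing = REQUIRED - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")
    out = df.copy()
    out["target"] = (out["helpful_vote"] >= 1).astype(int)  # 0 = not helpful, 1+ = helpful
    return out
```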
69
+ st.markdown("""
70
+ <hr style='
71
+ border: none;
72
+ height: 2px;
73
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
74
+ margin: 20px 0;
75
+ '>
76
+ """, unsafe_allow_html=True)
77
+
78
+ st.markdown("## 🗺️ How to Use This Workspace")
79
+
80
+ st.markdown("""
81
+ <div class="custom-section">
82
+ <strong>Recommended Workflow:</strong><br>
83
+ Follow these steps to get the most value from your analysis:
84
+ </div><br>
85
+ """, unsafe_allow_html=True)
86
+
87
+ st.markdown("""
88
+ <div class="custom-section">
89
+
90
+ <strong>1️⃣ Topic Modeling Tab</strong><br>
91
+ <strong>Purpose:</strong> Discover the main themes and topics discussed in your reviews
92
+
93
+ <strong>What you'll do:</strong>
94
+ <ul>
95
+ <li>Upload your dataset (CSV or Parquet format)</li>
96
+ <li>Select which text columns to analyze (review title, text, or both)</li>
97
+ <li>Configure topic modeling parameters (number of topics, n-gram range)</li>
98
+ <li>Optional: Filter by rating or helpful votes</li>
99
+ <li>Run the NMF topic model and visualize results</li>
100
+ </ul><br>
101
+
102
+ <strong>What you'll learn:</strong> The model extracts the most important words/phrases for each topic,
103
+ helping you understand what customers are talking about. You can see which topics dominate your reviews
104
+ and how they differ across ratings.
105
+ <br><br>
106
+
107
+ <strong>2️⃣ Predictive Modeling Tab</strong><br>
108
+ <strong>Purpose:</strong> Identify which reviews are most likely to be found helpful by other customers
109
+
110
+ <strong>What you'll do:</strong>
111
+ <ul>
112
+ <li>Choose to reuse data from Topic tab or upload new data</li>
113
+ <li>Click "Prep text & features" to process your reviews (lemmatization, feature engineering)</li>
114
+ <li>Run the prediction model to generate helpfulness probability scores</li>
115
+ <li>View predictions and explore the gains curve</li>
116
+ </ul>
117
+
118
+ <strong>What you'll learn:</strong> The model assigns each review a probability score (0-1) indicating
119
+ how likely it is to receive helpful votes. Use this to:
120
+ <ul>
121
+ <li>Prioritize which negative reviews to investigate for product issues</li>
122
+ <li>Identify positive reviews worth promoting</li>
123
+ <li>Understand what characteristics make reviews helpful in your domain</li>
124
+ </ul>
125
+
126
+ <strong>Pro tip:</strong> The gains curve shows how efficiently the model identifies helpful reviews.
127
+ If you can find 70% of helpful reviews by only reading the top 20% ranked by the model, that's significant time savings!
128
+ <br><br>
129
+
130
+ <strong>3️⃣ Chat Analysis Tab</strong><br>
131
+ <strong>Purpose:</strong> Get AI-powered insights and interpretations of your results
132
+
133
+ <strong>What you'll do:</strong>
134
+ <ul>
135
+ <li>Enter your OpenAI API key in the sidebar</li>
136
+ <li>Ask questions about your topic modeling results</li>
137
+ <li>Get help interpreting model performance metrics</li>
138
+ <li>Discuss what actions to take based on your findings</li>
139
+ </ul><br>
140
+
141
+ <strong>What you'll learn:</strong> The AI assistant has context about your entire analysis (dataset stats,
142
+ topics discovered, model performance) and can help you:
143
+ <ul>
144
+ <li>Name and interpret the topics discovered</li>
145
+ <li>Understand what the model performance metrics mean in practical terms</li>
146
+ <li>Generate actionable recommendations based on your results</li>
147
+ <li>Answer "what if" questions about your data</li>
148
+ </ul><br>
149
+
150
+ <strong>Example questions to ask:</strong>
151
+ <ul>
152
+ <li>"What are the main topics in my reviews and what should I name them?"</li>
153
+ <li>"Which reviews should I prioritize reading first?"</li>
154
+ <li>"What does the gains curve tell me about my model's performance?"</li>
155
+ <li>"Are there patterns in the helpful vs non-helpful reviews?"</li>
156
+ </ul>
157
+ </div>
158
+ """, unsafe_allow_html=True)
159
+
160
+ st.markdown("""
161
+ <hr style='
162
+ border: none;
163
+ height: 2px;
164
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
165
+ margin: 20px 0;
166
+ '>
167
+ """, unsafe_allow_html=True)
168
+
169
+ st.markdown("## 💡 Best Practices")
170
+
171
+ st.markdown("\n")
172
+
173
+ st.markdown("""
174
+ <div class="custom-section">
175
+ <strong>Data Quality</strong>
176
+ <ul>
177
+ <li>Include at least 1,000 reviews for meaningful topic modeling</li>
178
+ <li>Ensure text fields don't have excessive missing values</li>
179
+ <li>If most reviews have 0 helpful votes, consider filtering to reviews with at least 1 vote for topic modeling</li>
180
+ </ul><br>
181
+
182
+ <strong>Topic Modeling Tips</strong>
183
+ <ul>
184
+ <li>Start with 5-7 topics and adjust based on results</li>
185
+ <li>Use unigrams and bigrams (1-2 word phrases) for more interpretable topics</li>
186
+ <li>Add domain-specific stopwords in the optional parameters if common words dominate</li>
187
+ <li>Try filtering by rating to see how topics differ between positive and negative reviews</li>
188
+ </ul><br>
189
+
190
+ <strong>Predictive Modeling Tips</strong>
191
+ <ul>
192
+ <li>Review the gains curve to understand model performance — steep initial rise = efficient identification of helpful reviews</li>
193
+ <li>Focus on the probability scores, not just binary predictions</li>
194
+ <li>Sort predictions by probability to create a priority reading list</li>
195
+ <li>Cross-reference with ratings: high-probability negative reviews = priority issues</li>
196
+ </ul><br>
197
+
198
+ <strong>Chat Analysis Tips</strong>
199
+ <ul>
200
+ <li>Be specific with your questions for better answers</li>
201
+ <li>Ask the AI to suggest names for topics based on the top words</li>
202
+ <li>Use it to brainstorm action items based on your findings</li>
203
+ <li>Click "Reset Chat" to start fresh if the conversation gets off track</li>
204
+ </ul>
205
+ </div>
206
+ """, unsafe_allow_html=True)
207
+
208
+ st.markdown("""
209
+ <hr style='
210
+ border: none;
211
+ height: 2px;
212
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
213
+ margin: 20px 0;
214
+ '>
215
+ """, unsafe_allow_html=True)
216
+
217
+ st.markdown("## 🚀 Ready to Get Started?")
218
+
219
+ st.markdown("""
220
+ <div class="custom-section">
221
+ Head to the <strong>Topic Modeling</strong> tab to upload your data and begin your analysis!
222
+ </div>
223
+ """, unsafe_allow_html=True)
tabs/user_page_tabs/user_chat_analysis.py ADDED
@@ -0,0 +1,138 @@
1
+ import streamlit as st
2
+ from openai import OpenAI
3
+ import numpy as np
4
+
5
+ def system_prompt():
6
+ # Helper to find target column
7
+ def find_vote_column(df):
8
+ vote_cols = [col for col in df.columns if 'vote' in col.lower()]
9
+ return vote_cols[0] if vote_cols else None
10
+
11
+ # Helper to get key model params only
12
+ def get_key_params(params_dict):
13
+ key_params = ['alpha', 'class_weight', 'l1_ratio', 'loss', 'max_iter',
14
+ 'penalty', 'validation_fraction', 'learning_rate', 'eta0']
15
+ return {k: v for k, v in params_dict.items() if k in key_params}
16
+
17
+ # Build system prompt
18
+ vote_col = find_vote_column(st.session_state.raw_df) if 'raw_df' in st.session_state else None
19
+ target_dist = ""
20
+ if vote_col and 'raw_df' in st.session_state:
21
+ pos_rate = (st.session_state.raw_df[vote_col] > 0).mean() * 100
22
+ target_dist = f"\n- Target distribution: {pos_rate:.2f}% of reviews have helpful votes"
23
+
24
+ context = f"""You are an expert data science assistant helping users analyze customer review data. Your role is to help users understand and
25
+ interpret their analyses naturally - answer questions conversationally without dumping context unless directly relevant. Start each conversation by asking
26
+ the user where they would like to begin without giving a bunch of information and without providing any starting points. Let the user guide the conversation.
27
+ Your role is to be responsive, not proactive.
28
+
29
+ Below is the information from the customer review data the user uploaded. There is some basic information about the data, the results of topic modeling using NMF, the results
30
+ of the predictive modeling steps, and modeling performance on their data. Use the below information to guide your answers to their questions.
31
+
32
+ ## Dataset Overview
33
+ - Shape: {st.session_state.raw_df.shape[0]:,} reviews × {st.session_state.raw_df.shape[1]} columns
34
+ - Columns: {', '.join(st.session_state.raw_df.columns)}{target_dist}
35
+
36
+ ## Topic Modeling (NMF)
37
+ {"- Status: Not yet run" if not st.session_state.get('topics_fitted') else f'''- Analyzed columns: {', '.join(st.session_state.get('topic_columns', []))}
38
+ - TF-IDF n-grams: {st.session_state.get('topic_ngram')}
39
+ - Topics discovered: {len(st.session_state.get('top_topics', {}))}
40
+ - Top words and corresponding weights in percentages rounded to 4 digits {st.session_state.get('top_topics', {})}
41
+ - Each word or phrase is prefixed with the preprocessing step and column name followed by two underscores __. Ignore the prefixes and only refer to the words'''}
42
+
43
+ ## Predictive Model (Helpful Vote Prediction)
44
+ {"- Status: Not yet run" if not st.session_state.get('model_run') else f'''- Purpose: Identify reviews likely to receive helpful votes (for elevation/analysis)
45
+ - Architecture: TF-IDF → TruncatedSVD → SGD Classifier
46
+ - Title n-grams: {st.session_state.get('title_ngram', 'N/A')}
47
+ - Text n-grams: {st.session_state.get('text_ngram', 'N/A')}
48
+ - SVD: {st.session_state.get('svd_comps')} components ({st.session_state.get('explained_variance'):.1f}% variance explained)
49
+ - Model config (key params): {get_key_params(st.session_state.get('model_params', {}))}
50
+ - Features used: {', '.join(list(st.session_state.X.columns))}'''}
51
+
52
+ {"" if not st.session_state.get('model_run') or not st.session_state.get('accuracy') else f'''## Model Performance
53
+ - Accuracy: {st.session_state.get('accuracy', 0):.2f}%
54
+ - Precision: {st.session_state.get('precision', 0):.2f}%
55
+ - Recall: {st.session_state.get('recall', 0):.2f}%
56
+ - F1 Score: {st.session_state.get('f1', 0):.2f}%
57
+ - KS statistic: {st.session_state.get('ks_value', 0):.2f}
58
+ - Peak efficiency: Captures {st.session_state.get('peak_gains', 0):.2f}% of all helpful reviews by reviewing just {st.session_state.get('percent_data', 0):.4f}% of data
59
+ - Interpretation: This shows the lift over random selection for identifying valuable reviews'''}
60
+
61
+ ## Guidelines
62
+ - Be conversational and concise - only cite specific numbers when directly relevant
63
+ - Help interpret results in business terms (which reviews to prioritize, what topics matter)
64
+ - When discussing topics, reference them by their top words, not just numbers. Suggest potential topic names based on the top words for each topic when asked by the user
65
+ - When discussing model performance, focus on practical implications (e.g., "you can find 80% of helpful reviews by only reading the top 20%")
66
+ - Ask clarifying questions when the user's question is ambiguous
67
+ - Suggest analyses only when naturally relevant to the conversation"""
68
+
69
+ return context
70
+
71
+ def render():
72
+
73
+ st.set_page_config(page_title="Chat", layout="centered")
74
+
75
+ # --- Sidebar ---
76
+ openai_api_key = st.sidebar.text_input("OpenAI API Key", type="password")
77
+
78
+ if not openai_api_key:
79
+ st.info("Enter your OpenAI API key in the sidebar to use the chat.")
80
+ st.stop()
81
+
82
+ client = OpenAI(api_key=openai_api_key)
83
+
84
+ if st.sidebar.button("Reset Chat", type = 'primary'):
85
+ st.session_state.messages = []
86
+
87
+
88
+ # --- State ---
89
+ if "messages" not in st.session_state:
90
+ st.session_state.messages = []
91
+
92
+ # --- Put the chat UI ABOVE the input by creating a container first ---
93
+ chat_box = st.container()
94
+
95
+ with chat_box:
96
+ for msg in st.session_state.messages:
97
+ with st.chat_message(msg["role"]):
98
+ st.markdown(msg["content"])
99
+
100
+ # --- Keep input LAST so it stays at the bottom ---
101
+ user_input = st.chat_input("Say something")
102
+ SYSTEM_PROMPT = system_prompt()
103
+
104
+ if user_input:
105
+ # Add user message to state immediately
106
+ st.session_state.messages.append({"role": "user", "content": user_input})
107
+
108
+ # Render the new user message + stream assistant IN THE chat_box (above input)
109
+ with chat_box:
110
+ with st.chat_message("user"):
111
+ st.markdown(user_input)
112
+
113
+ with st.chat_message("assistant"):
114
+ # System prompt goes first (not shown in UI unless you add it to history)
115
+ model_messages = [{"role": "system", "content": SYSTEM_PROMPT}]
116
+ model_messages += [
117
+ {"role": m["role"], "content": m["content"]}
118
+ for m in st.session_state.messages
119
+ ]
120
+
121
+ stream = client.chat.completions.create(
122
+ model="gpt-5-nano",
123
+ messages=model_messages, # type: ignore
124
+ stream=True,
125
+ ) # type: ignore
126
+
127
+ response = st.write_stream(stream)
128
+
129
+ st.session_state.messages.append({"role": "assistant", "content": response})
130
+
131
+ # Optional but recommended: forces the “final” assistant message to appear
132
+ # in the history immediately (and keeps layout stable).
133
+ st.rerun()
134
+
135
+
136
+
137
+
138
+
tabs/user_page_tabs/user_pred_model.py ADDED
@@ -0,0 +1,322 @@
1
+ from __future__ import annotations
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import numpy as np
6
+ from utils.load_pred_model import load_model
7
+ from utils.prepare_user_dataframe import prep_text_and_features
8
+ from utils.plot_gains import plot_gains
9
+ from utils.load_user_data import load_uploaded_file
10
+
11
+
12
+ def _initialize_model(model):
13
+ """
14
+ Load the model and set session state variables.
15
+ Only runs when needed.
16
+ """
17
+ st.session_state.svd_comps = model[0][1].n_components
18
+ st.session_state.explained_variance = round(model[0][1].explained_variance_ratio_.sum() * 100, 2)
19
+ st.session_state.title_ngram = model[0][0]['tfidf_title'].ngram_range
20
+ st.session_state.text_ngram = model[0][0]['tfidf_text'].ngram_range
21
+ st.session_state.model_params = model[1].get_params()
22
+ st.session_state.model_loaded = True
23
+
24
+
25
+ # -------------------------------------------------------------------
26
+ # Session state helpers
27
+ # -------------------------------------------------------------------
28
+
29
+ def _init_state() -> None:
30
+ """
31
+ Initialize all session_state keys used in this tab.
32
+ """
33
+ defaults = {
34
+ # Shared across tabs
35
+ "user_base_df": None, # original uploaded df from topic tab
36
+ "user_processed_df": None, # processed df with lemma_* etc.
37
+ "user_raw_df": None, # alias kept for backwards compatibility
38
+ "raw_df": None, # alias kept for backwards compatibility
39
+
40
+ # Prediction-specific
41
+ "prepped_df": None,
42
+ "X": None,
43
+ "true_y": None,
44
+ "probs": None,
45
+ "prep_done": False,
46
+ "model_run": False,
47
+ "active_file_name": None,
48
+ "data_source": None, # "topic_tab" or "upload"
49
+ }
50
+ for key, value in defaults.items():
51
+ if key not in st.session_state:
52
+ st.session_state[key] = value
53
+
54
+
55
+ def _validate_schema(df: pd.DataFrame) -> bool:
56
+ """
57
+ Check that the dataframe has the columns required for the prediction model.
58
+ Allows extra columns; only errors on missing required columns.
59
+ """
60
+ required = {"helpful_vote", "review_title", "text", "images"}
61
+ df_cols = set(df.columns)
62
+ missing = required - df_cols
63
+
64
+ if missing:
65
+ st.error(
66
+ "The uploaded dataset is missing required columns for prediction: "
67
+ + ", ".join(sorted(missing))
68
+ )
69
+ with st.expander("View dataframe columns"):
70
+ st.write(sorted(df.columns.tolist()))
71
+ return False
72
+
73
+ return True
74
+
75
+
76
+ def _set_active_dataframe(df: pd.DataFrame, source: str, file_name: str | None = None) -> None:
77
+ """
78
+ Store the active dataframe in session_state and reset dependent state.
79
+ """
80
+ # For prediction we treat this df as the active working df
81
+ st.session_state.user_raw_df = df
82
+ st.session_state.raw_df = df # legacy alias
83
+ st.session_state.data_source = source
84
+ st.session_state.active_file_name = file_name
85
+
86
+ # Reset downstream state
87
+ st.session_state.prepped_df = None
88
+ st.session_state.X = None
89
+ st.session_state.true_y = None
90
+ st.session_state.probs = None
91
+ st.session_state.prep_done = False
92
+ st.session_state.model_run = False
93
+
94
+
95
+ # -------------------------------------------------------------------
96
+ # Main render function
97
+ # -------------------------------------------------------------------
98
+
99
+ def render() -> None:
100
+ _init_state()
101
+ model = load_model()
102
+ _initialize_model(model)
103
+
104
+ st.header("User Prediction Model")
105
+ st.markdown(
106
+ "Use this tab to run the **helpful-vote prediction model** on your dataset. "
107
+ "You can reuse the dataset from the topic modeling tab or upload a new file."
108
+ )
109
+
110
+ # Prefer the processed df from the topic tab if it exists
111
+ if "user_processed_df" in st.session_state:
112
+ topic_df = st.session_state["user_processed_df"]
113
+ elif "user_raw_df" in st.session_state:
114
+ topic_df = st.session_state["user_raw_df"]
115
+ else:
116
+ topic_df = None
117
+
118
+ has_topic_df = topic_df is not None
119
+
120
+ # -----------------------------
121
+ # Choose data source
122
+ # -----------------------------
123
+ if has_topic_df:
124
+ source = st.radio(
125
+ "Choose data source:",
126
+ ["Use data from Topic Modeling tab", "Upload new dataset"],
127
+ horizontal=True,
128
+ )
129
+ else:
130
+ source = "Upload new dataset"
131
+ st.info(
132
+ "No dataset found from the Topic Modeling tab. "
133
+ "Please upload a dataset to continue."
134
+ )
135
+
136
+ active_df: pd.DataFrame | None = None
137
+
138
+ # -----------------------------
139
+ # Option 1: reuse data from topic tab
140
+ # -----------------------------
141
+ if source == "Use data from Topic Modeling tab":
142
+ if not has_topic_df:
143
+ st.warning("No dataset available from the Topic Modeling tab.")
144
+ st.stop()
145
+
146
+ df = topic_df
147
+ if df is not None and _validate_schema(df):
148
+ # Only reset active dataframe if we weren't already using topic_tab
149
+ if st.session_state.data_source != "topic_tab":
150
+ _set_active_dataframe(df, source="topic_tab", file_name="from_topic_tab")
151
+ # Use the processed/topic df directly
152
+ active_df = df
153
+
154
+ # -----------------------------
155
+ # Option 2: upload one or more files
156
+ # -----------------------------
157
+ else:
158
+ uploaded_files = st.file_uploader(
159
+ "Upload one or more data files (CSV or Parquet)",
160
+ type=["csv", "parquet"],
161
+ key="predictive_data",
162
+ accept_multiple_files=True,
163
+ )
164
+
165
+ chosen_file = None
166
+ if uploaded_files:
167
+ if len(uploaded_files) == 1:
168
+ chosen_file = uploaded_files[0]
169
+ else:
170
+ file_names = [f.name for f in uploaded_files]
171
+ chosen_name = st.selectbox("Select which file to use", file_names)
172
+ chosen_file = next(f for f in uploaded_files if f.name == chosen_name)
173
+
174
+ if chosen_file is not None:
175
+ needs_new = (
176
+ st.session_state.data_source != "upload"
177
+ or st.session_state.active_file_name != chosen_file.name
178
+ )
179
+ if needs_new:
180
+ df = load_uploaded_file(chosen_file)
181
+ if df is not None and _validate_schema(df):
182
+ _set_active_dataframe(
183
+ df, source="upload", file_name=chosen_file.name
184
+ )
185
+ # Use whatever is currently active (may be newly set above)
186
+ active_df = st.session_state.user_raw_df
187
+ elif (
188
+ st.session_state.data_source == "upload"
189
+ and st.session_state.user_raw_df is not None
190
+ ):
191
+ # User uploaded a file earlier; reuse it
192
+ active_df = st.session_state.user_raw_df
193
+
194
+ # If we still don't have an active dataframe, bail out
195
+ if active_df is None and st.session_state.user_raw_df is None:
196
+ st.stop()
197
+
198
+ if active_df is None:
199
+ active_df = st.session_state.user_raw_df
200
+
201
+ # -----------------------------
202
+ # Data preview
203
+ # -----------------------------
204
+ st.markdown("### Active dataset")
205
+
206
+ if st.session_state.active_file_name:
207
+ st.caption(
208
+ f"Using data source: **{st.session_state.data_source}** "
209
+ f"({st.session_state.active_file_name})"
210
+ )
211
+ else:
212
+ st.caption(f"Using data source: **{st.session_state.data_source or 'unknown'}**")
213
+
214
+ with st.expander("Preview first 5 rows"):
215
+ st.dataframe(active_df.head(), width='stretch')
216
+
217
+ # -----------------------------
218
+ # Prep text & features
219
+ # -----------------------------
220
+ st.markdown("### Step 1 – Prepare text and features")
221
+
222
+ prep_col1, prep_col2 = st.columns([1, 3])
223
+ with prep_col1:
224
+ prep_clicked = st.button(
225
+ "Prep text & features",
226
+ type="primary",
227
+ help="Strip HTML, lemmatize text, and create engineered features.",
228
+ )
229
+
230
+ if prep_clicked:
231
+ try:
232
+ # 👇 Pass the *currently active* dataframe. If it has lemma_* columns
233
+ # from the topic tab, prep_text_and_features will reuse them and skip
234
+ # re-lemmatization.
235
+ prep_text_and_features(model, df=active_df)
236
+ except Exception as e:
237
+ st.error(f"Error during preprocessing: {e}")
238
+
239
+ if st.session_state.prep_done:
240
+ with prep_col2:
241
+ st.info(
242
+ "Preprocessing complete. You can now run the model and explore the results."
243
+ )
244
+
245
+ with st.expander("View prepped dataframe (first 5 rows)"):
246
+ st.dataframe(
247
+ st.session_state.prepped_df.head(), width="stretch"
248
+ )
249
+
250
+ # -----------------------------
251
+ # Run model
252
+ # -----------------------------
253
+ st.markdown("### Step 2 – Run prediction model")
254
+
255
+ if not st.session_state.prep_done:
256
+ st.warning("Please run *Prep text & features* before running the model.")
257
+ st.stop()
258
+
259
+ go_ahead = st.button(
260
+ label="Run model?",
261
+ type="primary",
262
+ icon="🔥",
263
+ help="Generate predicted helpful-vote probabilities.",
264
+ )
265
+
266
+ if go_ahead:
267
+ X = st.session_state.X
268
+ st.session_state.probs = model.predict_proba(X)
269
+ st.session_state.model_run = True
270
+ st.success("🔥 Model predictions have been generated.")
271
+
272
+ # -----------------------------
273
+ # Results & gains curve
274
+ # -----------------------------
275
+ st.markdown("### Step 3 – Explore results")
276
+
277
+ if not st.session_state.model_run:
278
+ st.info("Run the model to unlock predictions and the gains curve.")
279
+ return
280
+
281
+ true_y = st.session_state.true_y
282
+ probs = st.session_state.probs
283
+
284
+ show_table = st.checkbox("Show prediction table", value=True)
285
+ show_gains = st.checkbox("Show gains curve", value=True)
286
+
287
+ if show_table:
288
+ compare = pd.DataFrame(
289
+ {
290
+ "True Values": true_y,
291
+ "P(0 – No helpful votes)": probs[:, 0],
292
+ "P(1+ helpful votes)": probs[:, 1],
293
+ }
294
+ )
295
+ st.markdown("#### Prediction probabilities")
296
+ st.dataframe(compare.head(200), width="stretch")
297
+
298
+ if show_gains:
299
+ st.markdown("#### Gains curve")
300
+ fig, data, total = plot_gains(true_y, probs[:, 1])
301
+ st.plotly_chart(fig, width="stretch")
302
+
303
+ if st.session_state.model_run:
304
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
305
+
306
+ y_true = st.session_state.true_y
307
+ y_pred = model.predict(st.session_state.X)
308
+ st.session_state.accuracy = round(accuracy_score(y_true, y_pred) * 100, 4)
309
+ st.session_state.precision = np.round(precision_score(y_true, y_pred) * 100, 4)
310
+ st.session_state.recall = np.round(recall_score(y_true, y_pred) * 100, 4)
311
+ st.session_state.f1 = np.round(f1_score(y_true, y_pred) * 100, 4)
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+
322
+
tabs/user_page_tabs/user_topic_model.py ADDED
@@ -0,0 +1,313 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from typing import List, Optional, Tuple
4
+
5
+ from sklearn.compose import ColumnTransformer
6
+ from sklearn.decomposition import NMF
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.pipeline import make_pipeline, Pipeline
9
+
10
+ from utils.build_plotly import _build_topic_figure
11
+ from utils.load_user_data import load_uploaded_file
12
+ from utils.prepare_user_dataframe import prep_text_column
13
+
14
+ from nltk.corpus import stopwords # type: ignore
15
+ import plotly.graph_objects as go # type: ignore
16
+ import pandas.api.types as ptypes
17
+
18
+ # Build stopword list (don’t mutate across calls)
19
+ BASE_STOPWORDS = set(stopwords.words("english"))
20
+
21
+ CUSTOM_KEEP = {
22
+ 'not','no','but','ain','don',"don't",'aren',"aren't",'couldn',"couldn't",
23
+ 'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',
24
+ "haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',
25
+ "needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',
26
+ "weren't",'won',"won't",'wouldn',"wouldn't",'very','too'
27
+ }
28
+
29
+ DEFAULT_STOPWORDS = sorted(BASE_STOPWORDS - CUSTOM_KEEP)
30
+
31
+
32
+ @st.cache_data(show_spinner="One moment please!", show_time=True)
33
+ def make_topics(
34
+ df: pd.DataFrame,
35
+ user_title: str,
36
+ topic_columns: List[str],
37
+ n1: int,
38
+ n2: int,
39
+ n_components: int,
40
+ rating: Optional[List[int]] = None,
41
+ helpful_vote: Optional[int] = None,
42
+ new_words: Optional[List[str]] = None,
43
+ n_top_words: int = 5,
44
+ ) -> Tuple[ColumnTransformer | Pipeline, go.Figure]:
45
+ """
46
+ Fit TF-IDF + NMF topic model and return (pipeline, Plotly figure).
47
+ """
48
+
49
+ # Start from the input df (this will usually be st.session_state.user_processed_df)
50
+ base_df = df
51
+
52
+ # Validate selected columns exist
53
+ selected_cols = [col for col in topic_columns if col in base_df.columns]
54
+
55
+ if not selected_cols:
56
+ st.error("No valid columns selected for topic modeling.")
57
+ raise ValueError("No valid columns selected for topic modeling.")
58
+
59
+ # Check for text vs non-text columns
60
+ text_cols: list[str] = []
61
+ non_text_cols: list[str] = []
62
+
63
+ for col in selected_cols:
64
+ col_series = base_df[col]
65
+ if ptypes.is_string_dtype(col_series) or ptypes.is_object_dtype(col_series):
66
+ text_cols.append(col)
67
+ else:
68
+ non_text_cols.append(col)
69
+
70
+ if non_text_cols:
71
+ st.error(
72
+ "The following columns are not text columns and will be dropped "
73
+ f"from topic modeling: {', '.join(non_text_cols)}"
74
+ )
75
+
76
+ if not text_cols:
77
+ st.error("None of the selected columns are text columns. Please select text columns.")
78
+ raise ValueError("No text columns available for topic modeling.")
79
+
80
+ # Build stopword list
81
+ stop_list = DEFAULT_STOPWORDS.copy()
82
+ if new_words:
83
+ stop_list.extend(new_words)
84
+
85
+ # ------------------------------------------------------------------
86
+ # Ensure lemma columns exist on the *processed* dataframe
87
+ # ------------------------------------------------------------------
88
+ lemma_cols: list[str] = []
89
+ transformers = []
90
+
91
+ for col in text_cols:
92
+ if col == "review_title":
93
+ lemma_col = "lemma_title"
94
+ elif col == "text":
95
+ lemma_col = "lemma_text"
96
+ else:
97
+ lemma_col = f"lemma_{col}"
98
+
99
+ base_df = prep_text_column(
100
+ df=base_df,
101
+ text_col=col,
102
+ lemma_col=lemma_col,
103
+ overwrite=False,
104
+ )
105
+ lemma_cols.append(lemma_col)
106
+
107
+ transformers.append(
108
+ (
109
+ f"tfidf_{col}",
110
+ TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2)),
111
+ lemma_col,
112
+ )
113
+ )
114
+
115
+ # 🔁 Persist updated processed df back into session_state so the
116
+ # prediction tab can reuse lemma_* columns without re-lemmatizing.
117
+ st.session_state.user_processed_df = base_df
118
+ # Backwards-compat aliases used elsewhere in the app
119
+ st.session_state.raw_df = base_df
120
+ st.session_state.user_raw_df = base_df
121
+
122
+ # ------------------------------------------------------------------
123
+ # Working copy for filtering + modeling
124
+ # ------------------------------------------------------------------
125
+ work_df = base_df.copy()
126
+
127
+ if rating is not None and "rating" in work_df.columns:
128
+ work_df = work_df[work_df["rating"].isin(rating)]
129
+ if helpful_vote is not None and "helpful_vote" in work_df.columns:
130
+ work_df = work_df[work_df["helpful_vote"] > helpful_vote]
131
+
132
+ preprocessor = ColumnTransformer(transformers)
133
+
134
+ nmf = NMF(
135
+ n_components=n_components,
136
+ init="nndsvda",
137
+ solver="mu",
138
+ beta_loss=1,
139
+ random_state=10,
140
+ )
141
+
142
+ topic_pipeline = make_pipeline(preprocessor, nmf)
143
+
144
+ topic_pipeline.fit(work_df[lemma_cols])
145
+
146
+ feature_names = topic_pipeline[0].get_feature_names_out()
147
+ nmf_model: NMF = topic_pipeline[1]
148
+
149
+ fig = _build_topic_figure(
150
+ model=nmf_model,
151
+ feature_names=feature_names,
152
+ n_top_words=n_top_words,
153
+ title=user_title,
154
+ n_components=n_components,
155
+ bar_color="#184A90",
156
+ )
157
+
158
+ return topic_pipeline, fig
159
+
160
+
161
+ def render() -> None:
162
+ st.header("Upload Your Data and Run the Topic Model")
163
+
164
+ # --- STEP 1: Upload & basic review ---
165
+ st.subheader("Step 1: Upload and Review Your Data")
166
+
167
+ uploaded_file = st.file_uploader(
168
+ label="Upload your dataframe (CSV or Parquet)",
169
+ type=["csv", "parquet"],
170
+ key="topic_data",
171
+ label_visibility="collapsed",
172
+ )
173
+
174
+ if uploaded_file is not None:
175
+ file_name = uploaded_file.name
176
+ prev_name = st.session_state.get("topic_file_name")
177
+
178
+ # Only reload if the user picked a *different* file
179
+ if prev_name != file_name:
180
+ df = load_uploaded_file(uploaded_file)
181
+ st.session_state.topic_file_name = file_name
182
+
183
+ # Original upload (never mutated)
184
+ st.session_state.user_base_df = df
185
+
186
+ # Working / processed copy (lemma_* columns etc. get added here)
187
+ processed = df.copy() # type: ignore
188
+ st.session_state.user_processed_df = processed
189
+
190
+ # Backwards-compat aliases used by other parts of the app
191
+ st.session_state.raw_df = processed
192
+ st.session_state.user_raw_df = processed
193
+
194
+ # Prefer the processed dataframe if available
195
+ if "user_processed_df" in st.session_state:
196
+ raw_df = st.session_state["user_processed_df"]
197
+ elif "raw_df" in st.session_state:
198
+ raw_df = st.session_state["raw_df"]
199
+ else:
200
+ raw_df = None
201
+
202
+ if raw_df is None:
203
+ st.info("👆 Upload a dataframe to begin.")
204
+ return
205
+
206
+ st.dataframe(raw_df)
207
+
208
+ user_title = st.text_input(
209
+ "Title for topic plot",
210
+ value="Topics",
211
+ )
212
+
213
+ topic_columns = st.multiselect(
214
+ "Select one or more text columns to topic model",
215
+ options=list(raw_df.columns),
216
+ )
217
+
218
+ col1, col2 = st.columns(spec=2, gap="small")
219
+
220
+ with col1:
221
+ n1 = st.selectbox(
222
+ "First tfidf n-gram length",
223
+ options=[1, 2, 3, 4],
224
+ )
225
+ with col2:
226
+ n1_plus = st.selectbox(
227
+ "Second tfidf n-gram length",
228
+ options=[0, 1, 2],
229
+ help=(
230
+ "This number will add to n1 to create a range of n-gram lengths. "
231
+ "It's not recommended to go beyond an n-gram length of 4"
232
+ ),
233
+ )
234
+
235
+ num_topics = st.slider(
236
+ "Select number of topics to model",
237
+ min_value=1,
238
+ max_value=20,
239
+ value=5,
240
+ step=1,
241
+ help="Any more than 10 isn't usually constructive",
242
+ )
243
+
244
+ with st.expander("Optional parameters"):
245
+ st.write(
246
+ "These are optional parameters that can be selected "
247
+ "to configure your topic model plot"
248
+ )
249
+
250
+ tab1, tab2 = st.tabs(["Filters", "Other"])
251
+
252
+ with tab1:
253
+ st.write(
254
+ "If available, can filter your dataset by the following "
255
+ "columns to topic model by a specific rating or number of votes"
256
+ )
257
+ rating = st.multiselect(
258
+ "Rating equal to ...",
259
+ options=[1, 2, 3, 4, 5],
260
+ )
261
+ help_votes = st.number_input(
262
+ "Votes greater than ... ",
263
+ min_value=0,
264
+ max_value=None,
265
+ value="min",
266
+ step=1,
267
+ )
268
+
269
+ with tab2:
270
+ st.write(
271
+ "If you have domain specific words that are common, "
272
+ "removing them can help make specific topics more clear"
273
+ )
274
+ stop_words = st.multiselect(
275
+ "Optional stop words",
276
+ options=[],
277
+ placeholder="Add words you want removed",
278
+ accept_new_options=True,
279
+ )
280
+
281
+ top_words = st.slider(
282
+ "Number of top words you want displayed for each topic",
283
+ min_value=1,
284
+ max_value=10,
285
+ value=5,
286
+ step=1,
287
+ )
288
+
289
+ if st.button("Run Model", type="primary"):
290
+ if not topic_columns:
291
+ st.error("Please select at least one column for topic modeling.")
292
+ return
293
+
294
+ pipeline, fig = make_topics(
295
+ df=raw_df,
296
+ user_title=user_title,
297
+ topic_columns=topic_columns,
298
+ n1=n1,
299
+ n2=n1 + n1_plus,
300
+ n_components=num_topics,
301
+ rating=rating or None,
302
+ helpful_vote=help_votes or None,
303
+ new_words=stop_words,
304
+ n_top_words=top_words,
305
+ )
306
+
307
+ st.plotly_chart(fig)
308
+
309
+ st.session_state.topic_ngram = (n1, n1 + n1_plus)
310
+ st.session_state.columns = list(raw_df.columns)
311
+ st.session_state.topic_columns = topic_columns
312
+ st.session_state.topics_fitted = True
313
+
utils/__init__.py ADDED
File without changes
utils/build_plotly.py ADDED
@@ -0,0 +1,83 @@
1
+ import numpy as np
2
+ import streamlit as st
3
+ import plotly.graph_objects as go # type: ignore
4
+ from plotly.subplots import make_subplots # type: ignore
5
+ from sklearn.decomposition import NMF
6
+
7
+
8
+ # --------- Plot helper (Plotly) ---------
9
+ def _build_topic_figure(
10
+ model: NMF,
11
+ feature_names: np.ndarray,
12
+ n_top_words: int,
13
+ title: str,
14
+ n_components: int,
15
+ bar_color: str
16
+ ) -> go.Figure:
17
+ """Create a Plotly subplot grid of top terms per topic (horizontal bars)."""
18
+ # Layout: up to 2 columns, as many rows as needed
19
+ cols = 2 if n_components > 3 else 1
20
+ rows = int(np.ceil(n_components / cols))
21
+
22
+ fig = make_subplots(
23
+ rows=rows,
24
+ cols=cols,
25
+ subplot_titles=[f"Topic {i+1}" for i in range(n_components)],
26
+ horizontal_spacing=0.25,
27
+ vertical_spacing=0.1
28
+ )
29
+
30
+ top_features_dict = {}
31
+ max_weight = 0
32
+
33
+ for topic_idx, topic in enumerate(model.components_):
34
+ top_features_ind = topic.argsort()[::-1][:n_top_words]
35
+ top_features = feature_names[top_features_ind]
36
+ weights = topic[top_features_ind] / np.sum(topic) * 100
37
+ top_features_dict[topic_idx] = {"features": list(top_features), "weights": list(np.round(weights, 4))}
38
+
39
+ max_weight = max(max_weight, weights.max())
40
+
41
+ # subplot position
42
+ r = topic_idx // cols + 1
43
+ c = topic_idx % cols + 1
44
+
45
+ fig.add_trace(
46
+ go.Bar(
47
+ x=weights,
48
+ y=top_features,
49
+ orientation="h",
50
+ marker=dict(color=bar_color, line=dict(color="white", width=1)),
51
+ text=[f"{w:.2f}" for w in weights],
52
+ textposition="outside",
53
+ hovertemplate="<b>%{y}</b><br>weight=%{x:.2f}%<extra></extra>",
54
+ showlegend=False
55
+ ),
56
+ row=r, col=c
57
+ )
58
+
59
+ # nicer y ordering (largest at top)
60
+ fig.update_yaxes(autorange="reversed", row=r, col=c)
61
+
62
+ # Set x-axis range with padding for all subplots
63
+ for r_idx in range(1, rows + 1):
64
+ for c_idx in range(1, cols + 1):
65
+ fig.update_xaxes(
66
+ range=[0, max_weight * 1.25], # Add 25% padding for text labels
67
+ row=r_idx,
68
+ col=c_idx
69
+ )
70
+
71
+ # Axes labels for the bottom row
72
+ for c in range(1, cols + 1):
73
+ fig.update_xaxes(title_text="Relative Weight (%)", row=rows, col=c)
74
+
75
+ fig.update_layout(
76
+ title=f"<b>{title}</b>",
77
+ height=max(350, 330 * rows),
78
+ margin=dict(l=50, r=20, t=60, b=60)
79
+ )
80
+
81
+ st.session_state.top_topics = top_features_dict
82
+
83
+ return fig
utils/icons.py ADDED
@@ -0,0 +1,143 @@
1
+ ICONS = {
2
+ "git-merge": """
3
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
4
+ <circle cx="18" cy="18" r="3"></circle>
5
+ <circle cx="6" cy="6" r="3"></circle>
6
+ <path d="M6 9v6c0 2.2 1.8 4 4 4h4"></path>
7
+ <path d="m18 9-6-6"></path>
8
+ </svg>
9
+ """,
10
+ "scatter-chart": """
11
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
12
+ <path d="M21 21H3V3"></path>
13
+ <circle cx="10" cy="10" r="1"></circle>
14
+ <circle cx="17" cy="17" r="1"></circle>
15
+ <circle cx="7" cy="17" r="1"></circle>
16
+ <circle cx="17" cy="7" r="1"></circle>
17
+ </svg>
18
+ """,
19
+ "link": """
20
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
21
+ <path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"></path>
22
+ <path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"></path>
23
+ </svg>
24
+ """,
25
+ "layout-dashboard": """
26
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
27
+ <rect width="7" height="9" x="3" y="3" rx="1"></rect>
28
+ <rect width="7" height="5" x="14" y="3" rx="1"></rect>
29
+ <rect width="7" height="9" x="14" y="12" rx="1"></rect>
30
+ <rect width="7" height="5" x="3" y="16" rx="1"></rect>
31
+ </svg>
32
+ """,
33
+ "table": """
34
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
35
+ <path d="M12 3v18"></path>
36
+ <rect width="18" height="18" x="3" y="3" rx="2"></rect>
37
+ <path d="M3 9h18"></path>
38
+ <path d="M3 15h18"></path>
39
+ </svg>
40
+ """,
41
+ "info": """
42
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
43
+ <circle cx="12" cy="12" r="10"></circle>
44
+ <path d="M12 16v-4"></path>
45
+ <path d="M12 8h.01"></path>
46
+ </svg>
47
+ """,
48
+ "calculator": """
49
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
50
+ <rect width="16" height="20" x="4" y="2" rx="2"></rect>
51
+ <line x1="8" x2="16" y1="6" y2="6"></line>
52
+ <line x1="12" x2="12" y1="10" y2="18"></line>
53
+ <line x1="8" x2="16" y1="14" y2="14"></line>
54
+ </svg>
55
+ """,
56
+ "target": """
57
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
58
+ <circle cx="12" cy="12" r="10"></circle>
59
+ <circle cx="12" cy="12" r="6"></circle>
60
+ <circle cx="12" cy="12" r="2"></circle>
61
+ </svg>
62
+ """,
63
+ "tag": """
64
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
65
+ <path d="M12 2H2v10l9.29 9.29c.94.94 2.48.94 3.42 0l6.58-6.58c.94-.94.94-2.48 0-3.42L12 2Z"></path>
66
+ <path d="M7 7h.01"></path>
67
+ </svg>
68
+ """,
69
+ "tags": """
70
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
71
+ <path d="M9 5H2v7l6.29 6.29c.94.94 2.48.94 3.42 0l3.58-3.58"></path>
72
+ <path d="M13.29 17.71L21 10V3h-7l-6.29 6.29c-.94.94-.94 2.48 0 3.42l3.58 3.58"></path>
73
+ <path d="M7 7h.01"></path>
74
+ <path d="M15 15h.01"></path>
75
+ </svg>
76
+ """,
77
+ "trending-up": """
78
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
79
+ <polyline points="22 7 13.5 15.5 8.5 10.5 2 17"></polyline>
80
+ <polyline points="16 7 22 7 22 13"></polyline>
81
+ </svg>
82
+ """,
83
+ "search": """
84
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
85
+ <circle cx="11" cy="11" r="8"></circle>
86
+ <path d="m21 21-4.3-4.3"></path>
87
+ </svg>
88
+ """,
89
+ "hash": """
90
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
91
+ <line x1="4" x2="20" y1="9" y2="9"></line>
92
+ <line x1="4" x2="20" y1="15" y2="15"></line>
93
+ <line x1="10" x2="8" y1="3" y2="21"></line>
94
+ <line x1="16" x2="14" y1="3" y2="21"></line>
95
+ </svg>
96
+ """,
97
+ "alert-triangle": """
98
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
99
+ <path d="m21.73 18-8-14a2 2 0 0 0-3.46 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
100
+ <path d="M12 9v4"></path>
101
+ <path d="M12 17h.01"></path>
102
+ </svg>
103
+ """,
104
+ "bar-chart-2": """
105
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
106
+ <line x1="18" x2="18" y1="20" y2="10"></line>
107
+ <line x1="12" x2="12" y1="20" y2="4"></line>
108
+ <line x1="6" x2="6" y1="20" y2="14"></line>
109
+ </svg>
110
+ """,
111
+ "bar-chart": """
112
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
113
+ <line x1="12" x2="12" y1="20" y2="10"></line>
114
+ <line x1="18" x2="18" y1="20" y2="4"></line>
115
+ <line x1="6" x2="6" y1="20" y2="16"></line>
116
+ </svg>
117
+ """
118
+ }
119
+
120
+ def lucide_icon(name: str, size: int = 18) -> str:
121
+ """
122
+ Return an inline SVG or text placeholder for a Lucide icon.
123
+
124
+ Looks up the given icon name in the ICONS dictionary and returns
125
+ the corresponding SVG string, formatted with the requested size.
126
+ If the icon name is not found, falls back to a simple HTML <span>
127
+ displaying the icon name text at the same approximate size.
128
+
129
+ Args:
130
+ name (str): The Lucide icon name (e.g., "git-merge", "bar-chart-2").
131
+ size (int): Desired icon size in pixels. Applied to both width and height.
132
+
133
+ Returns:
134
+ str: HTML string containing either the SVG markup or a styled text placeholder.
135
+ """
136
+ svg = ICONS.get(name)
137
+ if svg:
138
+ return svg.format(size=size)
139
+ else:
140
+ safe = (name or "").replace("<", "&lt;").replace(">", "&gt;")
141
+ return f'<span style="font-size:{size}px; line-height:1; opacity:0.75;">{safe}</span>'
142
+
143
+
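A minimal usage sketch for lucide_icon from a Streamlit page in this app; the icon name and size below are illustrative, and unsafe_allow_html is needed because the helper returns raw SVG/HTML markup.

    import streamlit as st
    from utils.icons import lucide_icon

    # Render an inline SVG icon next to a label
    st.markdown(f"{lucide_icon('bar-chart-2', size=20)} Review dashboard", unsafe_allow_html=True)

    # Unknown names fall back to a plain-text placeholder instead of raising
    st.markdown(lucide_icon("not-a-real-icon"), unsafe_allow_html=True)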
utils/load_data.py ADDED
@@ -0,0 +1,24 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ from pathlib import Path
4
+ import os
5
+
6
+ @st.cache_data(show_spinner="Loading data...⏳")
7
+ def load_dataset(path: str | Path, category: str | None = None):
+ """Load a review parquet file: by category name within a data directory, or directly from a file path."""
8
+
9
+ DATA_OPTIONS = {
10
+ 'Beauty': 'All_Beauty.parquet',
11
+ 'Appliances': 'Appliances.parquet',
12
+ 'Baby Products': 'Baby_Products.parquet',
13
+ 'Electronics': 'Electronics.parquet',
14
+ 'Health and Household': 'Health_and_Household.parquet',
15
+ 'Movies and TV': 'Movies_and_TV.parquet'
16
+ }
17
+
18
+ if category:
19
+ data_path = os.path.join(path, DATA_OPTIONS[category])
20
+ df = pd.read_parquet(data_path)
21
+ else:
22
+ df = pd.read_parquet(path)
23
+
24
+ return df
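A hedged usage sketch for load_dataset; the local review_data directory and the category key are assumptions based on the DATA_OPTIONS mapping above, and the call is expected to run inside a Streamlit page so the cache decorator applies.

    from utils.load_data import load_dataset

    # Load one category from a directory of parquet files...
    appliances_df = load_dataset("review_data", category="Appliances")

    # ...or point directly at a single parquet file
    beauty_df = load_dataset("review_data/All_Beauty.parquet")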
utils/load_pred_model.py ADDED
@@ -0,0 +1,43 @@
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ import joblib
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ # def get_root():
8
+ # # app_utils is inside streamlit_app, so parent is streamlit_app
9
+ # return Path(__file__).resolve().parent.parent
10
+
11
+ # # def get_model_path():
12
+ # # return get_root() / "models" / "sgdc_pipeline.joblib"
13
+
14
+ # def get_data_path():
15
+ # return get_root() / "models" / "demo_data.parquet"
16
+
17
+ # @st.cache_resource(show_spinner='Loading model')
18
+ # def load_model():
19
+ # path = get_model_path()
20
+ # if not path.exists():
21
+ # raise FileNotFoundError(f"Model file not found at: {path}")
22
+ # return joblib.load(path)
23
+
24
+ @st.cache_resource(show_spinner='Loading model') # Cache the loaded pipeline so the download and joblib load happen only once per server process
25
+ def load_model():
26
+ # Download the serialized pipeline from the Hugging Face model repo
27
+ model_path = hf_hub_download(
28
+ repo_id="tkbarb10/ads505-prediction-model",
29
+ filename="sgdc_pipeline.joblib"
30
+ )
31
+ # Load the pipeline with joblib (the same library used to save it)
32
+ return joblib.load(model_path)
33
+
34
+
35
+ @st.cache_data(show_spinner='Loading demo data...')
36
+ def load_demo_data():
37
+ # Download the demo parquet file from the Hugging Face dataset repo
38
+ file_path = hf_hub_download(
39
+ repo_id="tkbarb10/ads505-review-data",
40
+ repo_type="dataset",
41
+ filename="demo_data.parquet"
42
+ )
43
+ return pd.read_parquet(file_path)
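A sketch of how these cached loaders might be combined on an app page; feature_names_in_ and predict_proba are assumptions about the saved SGDClassifier pipeline (predict_proba requires a probabilistic loss such as log_loss).

    from utils.load_pred_model import load_model, load_demo_data

    model = load_model()        # cached hub download + joblib.load
    demo_df = load_demo_data()  # cached parquet download

    # Score the demo data using the feature columns the pipeline was fitted on (assumption)
    feature_cols = list(model.feature_names_in_)
    probs = model.predict_proba(demo_df[feature_cols])[:, 1]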
utils/load_user_data.py ADDED
@@ -0,0 +1,13 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ def load_uploaded_file(uploaded_file) -> pd.DataFrame | None:
+ """Read an uploaded CSV or Parquet file into a DataFrame; otherwise show an error and return None."""
5
+ filename = uploaded_file.name.lower()
6
+
7
+ if filename.endswith(".csv"):
8
+ return pd.read_csv(uploaded_file)
9
+ elif filename.endswith(".parquet"):
10
+ return pd.read_parquet(uploaded_file)
11
+ else:
12
+ st.error("Unsupported file type. Please upload a CSV or Parquet file.")
13
+ return None
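A minimal sketch of the intended call site, assuming a standard st.file_uploader widget feeds this helper.

    import streamlit as st
    from utils.load_user_data import load_uploaded_file

    uploaded = st.file_uploader("Upload a review file", type=["csv", "parquet"])
    if uploaded is not None:
        raw_df = load_uploaded_file(uploaded)
        if raw_df is not None:
            st.session_state.raw_df = raw_df
            st.dataframe(raw_df.head())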
utils/plot_gains.py ADDED
@@ -0,0 +1,63 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import plotly.graph_objects as go #type: ignore
4
+
5
+ def plot_gains(y_true, y_probs):
+ """Build a cumulative gains curve with its KS statistic; returns (fig, data_percent, cum_percent) and stores summary values in st.session_state."""
6
+ # Build and sort dataframe
7
+ df = pd.DataFrame({
8
+ 'Actual': y_true,
9
+ 'Predicted': y_probs
10
+ }).sort_values(by='Predicted', ascending=False).reset_index(drop=True)
11
+
12
+ # Compute cumulative gain
13
+ df['Cumulative Percent'] = df['Actual'].cumsum() / df['Actual'].sum()
14
+ df['Percent of Data'] = (df.index + 1) / len(df)
15
+
16
+ # Compute the KS statistic (maximum vertical gap between the gains curve and the random baseline)
17
+ df['ks_stat'] = df['Cumulative Percent'] - df['Percent of Data']
18
+ ks_value = df['ks_stat'].max()
19
+ ks_idx = df['ks_stat'].idxmax()
20
+ cum_percent = df['Cumulative Percent'][ks_idx]
21
+ data_percent = df['Percent of Data'][ks_idx]
22
+
23
+ # Plotly figure
24
+ fig = go.Figure()
25
+
26
+ # Model Gains Curve
27
+ fig.add_trace(go.Scatter(
28
+ x=df['Percent of Data'],
29
+ y=df['Cumulative Percent'],
30
+ mode='lines',
31
+ name='Model Gains Curve',
32
+ line=dict(width=3)
33
+ ))
34
+
35
+ # Random baseline
36
+ fig.add_trace(go.Scatter(
37
+ x=[0, 1],
38
+ y=[0, 1],
39
+ mode='lines',
40
+ name='Random Baseline',
41
+ line=dict(width=2, dash='dash', color='gray')
42
+ ))
43
+
44
+ fig.add_annotation(
45
+ x=data_percent,
46
+ y=cum_percent,
47
+ text=f'Best Returns: {data_percent*100:.2f}%'
48
+ )
49
+
50
+ fig.update_layout(
51
+ title="Gains Curve",
52
+ xaxis_title="Percent of Data",
53
+ yaxis_title="Percent of Total Positive Cases Captured",
54
+ template="plotly_white",
55
+ height=450,
56
+ legend=dict(yanchor="bottom", y=0, xanchor="right", x=1)
57
+ )
58
+
59
+ st.session_state.ks_value = ks_value
60
+ st.session_state.peak_gains = round(data_percent * 100, 2)
61
+ st.session_state.percent_data = round(cum_percent * 100, 2)
62
+
63
+ return fig, data_percent, cum_percent
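A hedged example of wiring plot_gains into a page; it assumes earlier steps have stored the binary target in st.session_state.true_y and the positive-class probabilities in st.session_state.probs.

    import streamlit as st
    from utils.plot_gains import plot_gains

    fig, data_percent, cum_percent = plot_gains(
        y_true=st.session_state.true_y,
        y_probs=st.session_state.probs,
    )
    st.plotly_chart(fig, use_container_width=True)
    st.caption(
        f"Targeting the top {data_percent:.0%} of reviews captures "
        f"about {cum_percent:.0%} of the helpful ones."
    )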
utils/prepare_user_dataframe.py ADDED
@@ -0,0 +1,234 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import nltk # type: ignore
5
+ from nltk.stem import WordNetLemmatizer # type: ignore
6
+ from nltk.corpus import wordnet # type: ignore
7
+ from nltk.tokenize import word_tokenize # type: ignore
8
+ from typing import Optional
9
+
10
+
11
+ # -------------------------------------------------------------------
12
+ # HTML cleaning and lemmatization helpers
13
+ # -------------------------------------------------------------------
14
+
15
+
16
+ def remove_user_html_tags(text: str) -> str:
17
+ """Remove basic HTML entities/tags and lowercase the text.
18
+
19
+ This preserves the original behavior used when training the model.
20
+ """
21
+ if text is None:
22
+ return ""
23
+
24
+ # Replace common HTML entities with their corresponding characters
25
+ text = text.replace('&#34;', '"') # Replace "
26
+ text = text.replace('&quot;', '"') # Also replace the named entity for "
27
+ text = text.replace('&apos;', "'") # Replace '
28
+ text = text.replace('&#39;', "'") # Also replace the numeric entity for '
29
+ text = text.replace('&amp;', '&') # Replace &
30
+ text = text.replace('<br />', ' ') # Replace line breaks with a space
31
+ text = text.replace('<br>', ' ') # Also handle <br>
32
+
33
+ # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
34
+ clean_text = re.sub(r'<[^>]+>', '', text)
35
+
36
+ return clean_text.lower()
37
+
38
+
39
+ def get_wordnet_pos(treebank_tag: str) -> str:
40
+ """Converts treebank POS tags to WordNet POS tags."""
41
+ if treebank_tag.startswith('J'):
42
+ return wordnet.ADJ
43
+ elif treebank_tag.startswith('V'):
44
+ return wordnet.VERB
45
+ elif treebank_tag.startswith('N'):
46
+ return wordnet.NOUN
47
+ elif treebank_tag.startswith('R'):
48
+ return wordnet.ADV
49
+ else:
50
+ # Default to noun if the tag is not recognized
51
+ return wordnet.NOUN
52
+
53
+
54
+ def lemmatize_user_text(text: str) -> str:
55
+ """Tokenizes, POS-tags, and lemmatizes a string of text."""
56
+ if not isinstance(text, str):
57
+ text = "" if text is None else str(text)
58
+
59
+ lemmatizer = WordNetLemmatizer()
60
+
61
+ # 1. Tokenize the text into words
62
+ tokens = word_tokenize(text)
63
+
64
+ # 2. Get the part-of-speech tag for each token
65
+ tagged_tokens = nltk.pos_tag(tokens)
66
+
67
+ # 3. Lemmatize each word with its corresponding POS tag
68
+ lemmatized_output = []
69
+ for word, tag in tagged_tokens:
70
+ pos = get_wordnet_pos(tag)
71
+ lemma = lemmatizer.lemmatize(word, pos=pos)
72
+ lemmatized_output.append(lemma)
73
+
74
+ return " ".join(lemmatized_output)
75
+
76
+
77
+ def prep_text_column(
78
+ df: pd.DataFrame,
79
+ text_col: str,
80
+ lemma_col: str,
81
+ overwrite: bool = False,
82
+ ) -> pd.DataFrame:
83
+ """
84
+ Column-agnostic helper to clean HTML and create a lemma column.
85
+
86
+ - If lemma_col already exists and overwrite=False, we return df unchanged.
87
+ - Otherwise we copy df and do the expensive cleaning + lemmatization.
88
+ """
89
+ # ✅ Fast path: if lemma already exists and we don't want to recompute, just reuse it
90
+ if lemma_col in df.columns and not overwrite:
91
+ return df
92
+ else:
93
+ # Only now do we copy and do heavy work
94
+ df_out = df.copy()
95
+
96
+ if text_col not in df_out.columns:
97
+ raise KeyError(f"Column '{text_col}' not found in dataframe.")
98
+
99
+ df_out[text_col] = (
100
+ df_out[text_col]
101
+ .fillna("")
102
+ .apply(remove_user_html_tags)
103
+ .astype(str)
104
+ .str.strip()
105
+ )
106
+
107
+ df_out[lemma_col] = df_out[text_col].apply(lemmatize_user_text)
108
+
109
+ return df_out
110
+
111
+
112
+
113
+ # -------------------------------------------------------------------
114
+ # Internal text prep for prediction
115
+ # -------------------------------------------------------------------
116
+
117
+
118
+ @st.cache_data(show_spinner='Prepping data!')
119
+ def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
120
+ """Prepare core text columns for the prediction model.
121
+
122
+ This function:
123
+ - Ensures HTML cleaning + lemmatization for:
124
+ * 'text' -> 'lemma_text'
125
+ * 'review_title' -> 'lemma_title'
126
+ - Ensures the length features:
127
+ * 'Review Length'
128
+ * 'Title Length'
129
+
130
+ It is safe to call even if some of these columns already exist; in that case,
131
+ lemmatization is skipped and only length features are added if needed.
132
+ """
133
+ work_df = df.copy()
134
+
135
+ # Only lemmatize if the lemma columns are missing
136
+ if 'lemma_text' not in work_df.columns:
137
+ work_df = prep_text_column(work_df, text_col='text', lemma_col='lemma_text')
138
+
139
+ if 'lemma_title' not in work_df.columns:
140
+ work_df = prep_text_column(work_df, text_col='review_title', lemma_col='lemma_title')
141
+
142
+ # Ensure length features (only create if missing)
143
+ if 'Review Length' not in work_df.columns:
144
+ work_df['Review Length'] = work_df['text'].fillna('').apply(len)
145
+
146
+ if 'Title Length' not in work_df.columns:
147
+ work_df['Title Length'] = work_df['review_title'].fillna('').apply(len)
148
+
149
+ return work_df
150
+
151
+
152
+ # -------------------------------------------------------------------
153
+ # Public entry point used by the Streamlit app
154
+ # -------------------------------------------------------------------
155
+
156
+
157
+ def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
158
+ """Run text prep and feature assembly, storing results in session_state.
159
+
160
+ Behavior:
161
+ - If `df` is None, uses `st.session_state.raw_df` (current app behavior).
162
+ - Checks that required columns are present for the predictive model.
163
+ - Ensures HTML+lemma for title and text, and creates:
164
+ * 'Review Length'
165
+ * 'Title Length'
166
+ * 'vote' (binary target: 1 if helpful_vote > 0 else 0)
167
+ - Builds the feature matrix X based on `model.feature_names_in_`:
168
+ ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
169
+ - Stores:
170
+ * prepped_df
171
+ * X
172
+ * true_y
173
+ * prep_done flag
174
+ * resets downstream prediction state
175
+ """
176
+
177
+ if df is None:
178
+ df = st.session_state.get('raw_df')
179
+
180
+ if df is None:
181
+ st.warning("Upload a dataframe first.")
182
+ return
183
+
184
+ # Make sure the core columns are present
185
+ required_cols = {'helpful_vote', 'review_title', 'text', 'images'}
186
+ missing = required_cols - set(df.columns)
187
+
188
+ if missing:
189
+ st.error(
190
+ "The uploaded dataframe is missing required columns: "
191
+ + ", ".join(sorted(missing))
192
+ )
193
+ return
194
+
195
+ # Core text prep (HTML + lemma + length features)
196
+ prepped = _prep_user_text(df)
197
+
198
+ # Create binary target
199
+ prepped["vote"] = prepped["helpful_vote"].apply(lambda x: 1 if x > 0 else 0)
200
+
201
+ # Assemble features expected by the predictive model
202
+ # Your model expects:
203
+ # 'lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length'
204
+ # We still respect model.feature_names_in_ for robustness.
205
+ feature_cols = list(getattr(model, "feature_names_in_", [])) or [
206
+ "lemma_title",
207
+ "lemma_text",
208
+ "images",
209
+ "Review Length",
210
+ "Title Length",
211
+ ]
212
+
213
+ # Keep only columns that actually exist
214
+ feature_cols = [c for c in feature_cols if c in prepped.columns]
215
+
216
+ if not feature_cols:
217
+ st.error(
218
+ "No valid feature columns found for the model. Expected something like: "
219
+ "lemma_title, lemma_text, images, Review Length, Title Length."
220
+ )
221
+ return
222
+
223
+ X = prepped[feature_cols]
224
+ true_y = prepped["vote"]
225
+
226
+ # Store in session_state for downstream use
227
+ st.session_state.prepped_df = prepped
228
+ st.session_state.X = X
229
+ st.session_state.true_y = true_y
230
+ st.session_state.prep_done = True
231
+
232
+ # Reset downstream state if re-prepping
233
+ st.session_state.probs = None
234
+ st.session_state.model_run = False
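A sketch of the expected end-to-end flow, assuming an uploaded dataframe is already in st.session_state.raw_df and that the NLTK resources used above (punkt, wordnet, averaged_perceptron_tagger) have been downloaded.

    import streamlit as st
    from utils.load_pred_model import load_model
    from utils.prepare_user_dataframe import prep_text_and_features

    model = load_model()
    prep_text_and_features(model)  # reads st.session_state.raw_df

    if st.session_state.get("prep_done"):
        X = st.session_state.X
        true_y = st.session_state.true_y
        st.write(f"Prepared {len(X)} rows with features: {list(X.columns)}")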
utils/remove_html.py ADDED
@@ -0,0 +1,16 @@
1
+ import re
2
+
3
+ def remove_html_tags(text):
+ """Strip common HTML entities and tags from a review string and lowercase it."""
4
+ # Replace common HTML entities with their corresponding characters
5
+ text = text.replace('&#34;', '"') # Replace "
6
+ text = text.replace('&quot;', '"') # Also replace the named entity for "
7
+ text = text.replace('&apos;', "'") # Replace '
8
+ text = text.replace('&#39;', "'") # Also replace the numeric entity for '
9
+ text = text.replace('&amp;', '&') # Replace &
10
+ text = text.replace('<br />', ' ') # Replace line breaks with a space
11
+ text = text.replace('<br>', ' ') # Also handle <br>
12
+
13
+ # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
14
+ clean_text = re.sub(r'<[^>]+>', '', text)
15
+
16
+ return clean_text.lower()
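A quick, self-contained example of what remove_html_tags does to a raw review string.

    from utils.remove_html import remove_html_tags

    raw = "Great blender!<br />It&#39;s <b>MUCH</b> quieter than my old one."
    print(remove_html_tags(raw))
    # -> great blender! it's much quieter than my old one.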
utils/topically.py ADDED
@@ -0,0 +1,163 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import List, Optional, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.decomposition import NMF
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.pipeline import make_pipeline, Pipeline
12
+ from utils.build_plotly import _build_topic_figure
13
+
14
+ import plotly.graph_objects as go # type: ignore
15
+
16
+ import streamlit as st
17
+ from nltk.corpus import stopwords # type: ignore
18
+
19
+ from utils.remove_html import remove_html_tags
20
+
21
+ # --------- Defaults / Paths ---------
22
+ # ROOT = Path(__file__).resolve().parents[1]
23
+ # DEFAULT_DATA_DIR = ROOT / "review_data"
24
+
25
+ from huggingface_hub import snapshot_download
26
+
27
+ @st.cache_resource
28
+ def get_data_directory():
29
+ # Download a snapshot of the Hugging Face dataset repo (it contains the review_data folder)
30
+ data_path = snapshot_download(
31
+ repo_id="tkbarb10/ads505-review-data",
32
+ repo_type="dataset"
33
+ )
34
+ return Path(data_path) / "review_data"
35
+
36
+ DEFAULT_DATA_DIR = get_data_directory()
37
+
38
+ COLOR_WHEEL = {
39
+ "All_Beauty": "#d946ef", # magenta-ish
40
+ "Appliances": "#800000", # maroon
41
+ "Baby_Products": "#87ceeb", # skyblue
42
+ "Electronics": "#ffd700", # gold
43
+ "Health_and_Household": "#3cb371", # mediumseagreen
44
+ "Movies_and_TV": "#663399" # rebeccapurple
45
+ }
46
+
47
+ # Build stopword list (don’t mutate across calls)
48
+ BASE_STOPWORDS = set(stopwords.words("english"))
49
+ CUSTOM_KEEP = {
50
+ 'not','no','but','ain','don',"don't",'aren',"aren't",'couldn',"couldn't",
51
+ 'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',
52
+ "haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',
53
+ "needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',
54
+ "weren't",'won',"won't",'wouldn',"wouldn't",'very','too'
55
+ }
56
+ DEFAULT_STOPWORDS = sorted(list(BASE_STOPWORDS - CUSTOM_KEEP))
57
+
58
+
59
+ # --------- Data loading / modeling ---------
60
+ def _load_category_df(
61
+ data_dir: Path | str,
62
+ category: str,
63
+ lemmatize: bool,
64
+ nrows: int
65
+ ) -> pd.DataFrame:
66
+ """Load parquet for category; choose lemma or raw; basic cleaning."""
67
+ data_dir = Path(data_dir)
68
+ path = data_dir / f"{category}.parquet"
69
+ lemma_path = data_dir / f"lemma_data/{category}.parquet"
70
+
71
+ if lemmatize:
72
+ df = pd.read_parquet(lemma_path)
73
+ else:
74
+ df = pd.read_parquet(path)
75
+ if "text" in df.columns:
76
+ df["text"] = df["text"].astype(str).str.strip().apply(remove_html_tags)
77
+
78
+ return df.iloc[:nrows, :].copy()
79
+
80
+
81
+ #@st.cache_data(show_spinner="One moment please!", show_time=True)
82
+ def make_topics(
83
+ category: str,
84
+ topic_columns: str,
85
+ lemmatize: bool,
86
+ n1: int,
87
+ n2: int,
88
+ n_components: int,
89
+ rating: Optional[List[int]] = None,
90
+ helpful_vote: Optional[int] = None,
91
+ new_words: Optional[List[str]] = None,
92
+ n_top_words: int = 5,
93
+ data_dir: Optional[str | Path] = None,
94
+ nrows: int = 10_000
95
+ ) -> Tuple[ColumnTransformer | Pipeline, go.Figure]:
96
+ """
97
+ Fit TF-IDF + NMF topic model and return (pipeline, Plotly figure).
98
+
99
+ Returns:
100
+ (topic_pipeline, fig)
101
+ """
102
+ data_dir = data_dir or DEFAULT_DATA_DIR
103
+ df = _load_category_df(data_dir, category, lemmatize, nrows=nrows)
104
+
105
+ # Optional filters
106
+ if rating is not None and "rating" in df.columns:
107
+ df = df[df["rating"].isin(rating)]
108
+ if helpful_vote is not None and "helpful_vote" in df.columns:
109
+ df = df[df["helpful_vote"] > helpful_vote]
110
+
111
+ # Columns to model
112
+ topic_columns = (topic_columns or "").strip().lower()
113
+ # Make a fresh stopword list each call to avoid global mutation
114
+ stop_list = list(DEFAULT_STOPWORDS)
115
+ if new_words:
116
+ stop_list.extend(new_words)
117
+
118
+ tfidf_text = TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2))
119
+ tfidf_title = TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2))
120
+
121
+ if topic_columns == "both":
122
+ preprocessor = ColumnTransformer([
123
+ ("title", tfidf_title, "title"),
124
+ ("text", tfidf_text, "text")
125
+ ])
126
+ elif topic_columns == "text":
127
+ preprocessor = ColumnTransformer([("text", tfidf_text, "text")])
128
+ else:
129
+ # default to title if not 'both' or 'text'
130
+ preprocessor = ColumnTransformer([("title", tfidf_title, "title")])
131
+
132
+ nmf = NMF(
133
+ n_components=n_components,
134
+ init="nndsvda",
135
+ solver="mu",
136
+ beta_loss=1,
137
+ random_state=10
138
+ )
139
+
140
+ topic_pipeline = make_pipeline(preprocessor, nmf)
141
+ # Fit on only the columns the preprocessor expects
142
+ fit_cols = [c for c in ["title", "text"] if c in df.columns]
143
+ topic_pipeline.fit(df[fit_cols])
144
+
145
+ feature_names = topic_pipeline[0].get_feature_names_out()
146
+ nmf_model: NMF = topic_pipeline[1]
147
+
148
+ # Choose color from map (fallback if category label differs)
149
+ bar_color = COLOR_WHEEL.get(category, "#184A90")
150
+
151
+ fig = _build_topic_figure(
152
+ model=nmf_model,
153
+ feature_names=feature_names,
154
+ n_top_words=n_top_words,
155
+ title=category,
156
+ n_components=n_components,
157
+ bar_color=bar_color
158
+ )
159
+
160
+ return topic_pipeline, fig
161
+
162
+
163
+
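A hedged call sketch for make_topics; the argument values are illustrative, and running it assumes the category parquet files exist in the downloaded dataset snapshot.

    import streamlit as st
    from utils.topically import make_topics

    pipeline, fig = make_topics(
        category="All_Beauty",
        topic_columns="text",   # "title", "text", or "both"
        lemmatize=False,
        n1=1, n2=2,             # unigram and bigram range for TF-IDF
        n_components=5,         # number of NMF topics
        rating=[1, 2],          # optional filter: low-star reviews only
        n_top_words=5,
    )
    st.plotly_chart(fig, use_container_width=True)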
uv.lock ADDED
The diff for this file is too large to render. See raw diff