Taylor Kirk committed on
Commit 5d4981c · 0 Parent(s)

Fresh deployment after moving datasets to hf datahub

.gitignore ADDED
@@ -0,0 +1,212 @@
1
+ # Custom ignore
2
+ models/sgdc_pipeline.joblib
3
+ review_data/
4
+ models/demo_data.parquet
5
+
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[codz]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ share/python-wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .nox/
48
+ .coverage
49
+ .coverage.*
50
+ .cache
51
+ nosetests.xml
52
+ coverage.xml
53
+ *.cover
54
+ *.py.cover
55
+ .hypothesis/
56
+ .pytest_cache/
57
+ cover/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+ db.sqlite3-journal
68
+
69
+ # Flask stuff:
70
+ instance/
71
+ .webassets-cache
72
+
73
+ # Scrapy stuff:
74
+ .scrapy
75
+
76
+ # Sphinx documentation
77
+ docs/_build/
78
+
79
+ # PyBuilder
80
+ .pybuilder/
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ # For a library or package, you might want to ignore these files since the code is
92
+ # intended to run in multiple environments; otherwise, check them in:
93
+ # .python-version
94
+
95
+ # pipenv
96
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
98
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
99
+ # install all needed dependencies.
100
+ #Pipfile.lock
101
+
102
+ # UV
103
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
104
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
105
+ # commonly ignored for libraries.
106
+ #uv.lock
107
+
108
+ # poetry
109
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113
+ #poetry.lock
114
+ #poetry.toml
115
+
116
+ # pdm
117
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
118
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
119
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
120
+ #pdm.lock
121
+ #pdm.toml
122
+ .pdm-python
123
+ .pdm-build/
124
+
125
+ # pixi
126
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
127
+ #pixi.lock
128
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
129
+ # in the .venv directory. It is recommended not to include this directory in version control.
130
+ .pixi
131
+
132
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
133
+ __pypackages__/
134
+
135
+ # Celery stuff
136
+ celerybeat-schedule
137
+ celerybeat.pid
138
+
139
+ # SageMath parsed files
140
+ *.sage.py
141
+
142
+ # Environments
143
+ .env
144
+ .envrc
145
+ .venv
146
+ env/
147
+ venv/
148
+ ENV/
149
+ env.bak/
150
+ venv.bak/
151
+
152
+ # Spyder project settings
153
+ .spyderproject
154
+ .spyproject
155
+
156
+ # Rope project settings
157
+ .ropeproject
158
+
159
+ # mkdocs documentation
160
+ /site
161
+
162
+ # mypy
163
+ .mypy_cache/
164
+ .dmypy.json
165
+ dmypy.json
166
+
167
+ # Pyre type checker
168
+ .pyre/
169
+
170
+ # pytype static type analyzer
171
+ .pytype/
172
+
173
+ # Cython debug symbols
174
+ cython_debug/
175
+
176
+ # PyCharm
177
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
180
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
181
+ #.idea/
182
+
183
+ # Abstra
184
+ # Abstra is an AI-powered process automation framework.
185
+ # Ignore directories containing user credentials, local state, and settings.
186
+ # Learn more at https://abstra.io/docs
187
+ .abstra/
188
+
189
+ # Visual Studio Code
190
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
191
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
192
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
193
+ # you could uncomment the following to ignore the entire vscode folder
194
+ # .vscode/
195
+
196
+ # Ruff stuff:
197
+ .ruff_cache/
198
+
199
+ # PyPI configuration file
200
+ .pypirc
201
+
202
+ # Cursor
203
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
204
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
205
+ # refer to https://docs.cursor.com/context/ignore-files
206
+ .cursorignore
207
+ .cursorindexingignore
208
+
209
+ # Marimo
210
+ marimo/_static/
211
+ marimo/_lsp/
212
+ __marimo__/
.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.12
Dockerfile ADDED
@@ -0,0 +1,27 @@
1
+ # 1. Match your local Python version
2
+ FROM python:3.12-slim
3
+
4
+ # 2. Install uv
5
+ COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
6
+
7
+ WORKDIR /app
8
+
9
+ # 3. Install system dependencies
10
+ RUN apt-get update && apt-get install -y \
11
+ build-essential \
12
+ curl \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # 4. Copy and install dependencies
16
+ COPY pyproject.toml uv.lock ./
17
+ RUN uv pip install --system --no-cache -r pyproject.toml
18
+
19
+ # 5. Pre-download NLTK data (Matches your app.py list)
20
+ RUN python3 -m nltk.downloader stopwords wordnet omw-1.4 punkt_tab averaged_perceptron_tagger_eng
21
+
22
+ # 6. Copy the rest of the code
23
+ COPY . .
24
+
25
+ EXPOSE 7860
26
+
27
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,8 @@
1
+ ---
2
+ title: ADS 505 Review Analytics
3
+ emoji: 🌐
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ ---
app.py ADDED
@@ -0,0 +1,28 @@
1
+ import streamlit as st
2
+ import nltk
3
+
4
+ @st.cache_resource
5
+ def setup_nltk():
6
+ resources = [
7
+ ('corpora/stopwords', 'stopwords'),
8
+ ('corpora/wordnet', 'wordnet'),
9
+ ('corpora/omw-1.4', 'omw-1.4'),
10
+ ('tokenizers/punkt_tab', 'punkt_tab'),
11
+ ('taggers/averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger_eng')
12
+ ]
13
+ for resource_path, package_name in resources:
14
+ try:
15
+ nltk.data.find(resource_path)
16
+ except LookupError:
17
+ nltk.download(package_name)
18
+
19
+ setup_nltk()
20
+
21
+ main_page = st.Page("pages/main.py", title="Home", icon="🏠")
22
+ topic_model = st.Page("pages/topic.py", title="Topic Modeling", icon="🌐")
23
+ modeling = st.Page("pages/model.py", title='Predictive Modeling', icon='🎱')
24
+ user = st.Page("pages/user.py", title='User Page', icon="👤")
25
+
26
+ pg = st.navigation({"Pages": [main_page, topic_model, modeling, user]})
27
+
28
+ pg.run()
pages/__init__.py ADDED
File without changes
pages/main.py ADDED
@@ -0,0 +1,41 @@
1
+ import streamlit as st
2
+ from tabs.main_page_tabs.dataset_overview import render as render_overview
3
+ from tabs.main_page_tabs.univariate_analysis import render as render_uni
4
+ from tabs.main_page_tabs.bivariate_analysis import render as render_bi
5
+ from tabs.main_page_tabs.target_variable import render as render_target
6
+ from tabs.main_page_tabs.home import render as render_home
7
+ from tabs.main_page_tabs.text_analysis import render as render_text
8
+
9
+ # Page Title
10
+ st.set_page_config(
11
+ page_title='Amazon Reviews',
12
+ layout='wide',
13
+ page_icon=":panda_face:"
14
+ )
15
+
16
+ st.title('Explore the wild world of Amazon Reviews')
17
+
18
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["About", 'Dataset Overview', 'Univariate Analysis', 'Bivariate Analysis', 'Target', 'Text Analysis'])
19
+
20
+ with tab1:
21
+ render_home()
22
+
23
+ with tab2:
24
+ render_overview()
25
+
26
+ if st.session_state.get("explore_df") is not None:
27
+ with tab3:
28
+ render_uni(st.session_state.explore_df)
29
+
30
+ with tab4:
31
+ render_bi(st.session_state.explore_df)
32
+
33
+ with tab5:
34
+ render_target(st.session_state.explore_df)
35
+
36
+ with tab6:
37
+ render_text(st.session_state.explore_df)
38
+
39
+
40
+
41
+
pages/model.py ADDED
@@ -0,0 +1,14 @@
1
+ import streamlit as st
2
+ from tabs.predictive_model_tabs.pred_model_one import render as render_about
3
+ from tabs.predictive_model_tabs.pred_model_two import render as render_demo
4
+
5
+ tab1, tab2 = st.tabs(['🤷‍♂️ About', ':star2: Demo'])
6
+
7
+ with tab1:
8
+ render_about()
9
+
10
+ with tab2:
11
+ render_demo()
12
+
13
+
14
+
pages/topic.py ADDED
@@ -0,0 +1,39 @@
1
+ import streamlit as st
2
+ from utils.topically import make_topics
3
+
4
+ st.set_page_config(layout="wide")
5
+
6
+ DATA_OPTIONS = {
7
+ 'Beauty': 'All_Beauty',
8
+ 'Appliances': 'Appliances',
9
+ 'Baby Products': 'Baby_Products',
10
+ 'Electronics': 'Electronics',
11
+ 'Health and Household': 'Health_and_Household',
12
+ 'Movies and TV': 'Movies_and_TV'
13
+ }
14
+
15
+ st.markdown("# Topic Modeling")
16
+
17
+ cat = st.sidebar.selectbox(
18
+ "Choose the dataset to model",
19
+ tuple(DATA_OPTIONS.keys()),
20
+ index=None
21
+ )
22
+ column = st.sidebar.selectbox("Choose a column to model", ("Text", "Title", "Both"), index=None)
23
+
24
+ if cat and column:
25
+ category = DATA_OPTIONS[cat]
26
+ topic_pipeline, fig = make_topics(
27
+ category=category,
28
+ topic_columns=column,
29
+ lemmatize=True, # or False
30
+ n1=2,
31
+ n2=3,
32
+ n_components=5,
33
+ rating=[1, 2], # optional
34
+ helpful_vote=0, # optional
35
+ new_words=None,
36
+ n_top_words=5,
37
+ # data_dir="path/to/review_data" # optional override if needed
38
+ )
39
+ st.plotly_chart(fig, use_container_width=True, config={"scrollZoom": True})
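
Note for reviewers: `utils.topically.make_topics` is not part of this hunk, so the call above is the only view of it here. As a rough orientation, a minimal NMF topic extraction of the kind the home page describes (TF-IDF of the chosen text column, NMF with `n_components` topics, top words per topic) is sketched below. Every name and default in this sketch is illustrative only and is not the project's actual implementation; the real function also filters by rating and helpful_vote and returns a Plotly figure.

```python
# Illustrative sketch only -- NOT the project's utils.topically.make_topics.
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

def sketch_topics(texts: pd.Series, n_components: int = 5,
                  n1: int = 2, n2: int = 3, n_top_words: int = 5):
    """Fit TF-IDF + NMF on one text column and return the top words per topic."""
    tfidf = TfidfVectorizer(stop_words="english", ngram_range=(n1, n2), max_features=20_000)
    doc_term = tfidf.fit_transform(texts.fillna(""))
    nmf = NMF(n_components=n_components, init="nndsvd", random_state=42)
    nmf.fit(doc_term)
    vocab = tfidf.get_feature_names_out()
    # Highest-weighted terms in each NMF component are the topic's top words
    return [[vocab[i] for i in topic.argsort()[::-1][:n_top_words]]
            for topic in nmf.components_]
```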
pages/user.py ADDED
@@ -0,0 +1,19 @@
1
+ import streamlit as st
2
+ from tabs import render_about
3
+ from tabs import render_topic
4
+ from tabs import render_pred
5
+ from tabs import render_analysis
6
+
7
+ tab1, tab2, tab3, tab4 = st.tabs(['🤷‍♂️ About', ':star2: Topic', '▶️ Predictive', "💬 Chat Analysis"])
8
+
9
+ with tab1:
10
+ render_about()
11
+
12
+ with tab2:
13
+ render_topic()
14
+
15
+ with tab3:
16
+ render_pred()
17
+
18
+ with tab4:
19
+ render_analysis()
pyproject.toml ADDED
@@ -0,0 +1,32 @@
1
+ [project]
2
+ name = "final-project"
3
+ version = "0.1.0"
4
+ description = "Analyzing customer review data from Amazon for final project for ADS505"
5
+ authors = [
6
+ {name = "Taylor Kirk",email = "tkirk@sandiego.edu"},
7
+ {name = "Sushama Kafle",email = "skafle@sandiego.edu"},
8
+ {name = "Luigi Salemi",email = "lsalemi@sandiego.edu"}
9
+ ]
10
+ license = {text = "Apache 2.0"}
11
+ readme = "README.md"
12
+ requires-python = ">=3.12"
13
+ dependencies = [
14
+ "requests (>=2.32.5,<3.0.0)",
15
+ "pandas (>=2.3.2,<3.0.0)",
16
+ "scipy (>=1.16.2,<2.0.0)",
17
+ "scikit-learn (>=1.7.2,<2.0.0)",
18
+ "matplotlib (>=3.10.6,<4.0.0)",
19
+ "plotly (>=6.3.0,<7.0.0)",
20
+ "ipykernel (>=6.30.1,<7.0.0)",
21
+ "nbformat (>=5.10.4,<6.0.0)",
22
+ "nltk",
23
+ "streamlit",
24
+ "seaborn",
25
+ "emoji",
26
+ "openai>=2.13.0",
27
+ "wordcloud>=1.9.4",
28
+ "textstat>=0.7.12",
29
+ "pathlib>=1.0.1",
30
+ "joblib>=1.5.2",
31
+ ]
32
+
tabs/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Tabs module for Streamlit EDA app
2
+
3
+ from .user_page_tabs import *
tabs/main_page_tabs/__init__.py ADDED
File without changes
tabs/main_page_tabs/bivariate_analysis.py ADDED
@@ -0,0 +1,139 @@
1
+ """
2
+ Bivariate Analysis Tab
3
+ This tab displays relationships between variables:
4
+ - Scatter plots (e.g., Price vs Average Rating)
5
+ - Correlation heatmap (triangular, using Plotly)
6
+ """
7
+
8
+ import streamlit as st
9
+ import pandas as pd
10
+ import numpy as np
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from utils.icons import lucide_icon
14
+
15
+
16
+ def render(df):
17
+ """
18
+ Render the Bivariate Analysis tab.
19
+
20
+ Args:
21
+ df (pd.DataFrame): The main dataset
22
+ """
23
+ st.markdown(
24
+ f'<h2 class="section-header icon-header">{lucide_icon("git-merge", size=28)} Bivariate Analysis</h2>',
25
+ unsafe_allow_html=True
26
+ )
27
+
28
+ with st.form('Bivariate Form'):
29
+ col1, col2, col3 = st.columns(3)
30
+
31
+ with col1:
32
+ st.subheader("Select First Column")
33
+ first_choice = st.selectbox(
34
+ "First Column",
35
+ options=df.columns,
36
+ key='first_col_select',
37
+ index=None,
38
+ placeholder='--Select Column--'
39
+ )
40
+
41
+ with col2:
42
+ st.subheader("Select Second Column")
43
+ second_choice = st.selectbox(
44
+ "Second Column",
45
+ options=df.columns,
46
+ key='second_col_select',
47
+ index=None,
48
+ placeholder='--Select Column--'
49
+ )
50
+
51
+ with col3:
52
+ st.subheader("Plot Choice")
53
+ plot_choice = st.selectbox(
54
+ 'Select Plot',
55
+ ['Scatter', 'Correlation'],
56
+ index=None,
57
+ placeholder="--Select Plot Type--"
58
+ )
59
+
60
+ submitted = st.form_submit_button("Plot Away")
61
+
62
+
63
+ # Scatter Plot
64
+ if plot_choice == 'Scatter':
65
+ scatter_fig = px.scatter(
66
+ df,
67
+ x=first_choice,
68
+ y=second_choice,
69
+ title=f'<b>Compare {first_choice} and {second_choice}</b>',
70
+ color_discrete_sequence=['darkgreen'],
71
+ opacity=0.6
72
+ )
73
+ st.plotly_chart(scatter_fig, use_container_width=True)
74
+
75
+ # Correlation Analysis
76
+ if plot_choice == 'Correlation':
77
+ st.markdown(
78
+ f"<h3>{lucide_icon('link', size=20)} Correlation Analysis<h3>",
79
+ unsafe_allow_html=True
80
+ )
81
+
82
+ corr_matrix = df.loc[:, [first_choice, second_choice]].corr()
83
+
84
+ # Create mask for upper triangle (including diagonal)
85
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
86
+
87
+ # Apply mask - set upper triangle to NaN
88
+ corr_masked = corr_matrix.mask(mask)
89
+
90
+ # Create custom text for annotations (only show values for lower triangle)
91
+ text_values = []
92
+ for i in range(len(corr_matrix)):
93
+ row_text = []
94
+ for j in range(len(corr_matrix)):
95
+ if i > j: # Lower triangle only
96
+ row_text.append(f"{corr_matrix.iloc[i, j]:.2f}")
97
+ else:
98
+ row_text.append("")
99
+ text_values.append(row_text)
100
+
101
+ # Create Plotly heatmap
102
+ fig_corr = go.Figure(data=go.Heatmap(
103
+ z=corr_masked.values,
104
+ x=corr_masked.columns,
105
+ y=corr_masked.index,
106
+ colorscale='RdBu_r',
107
+ zmid=0,
108
+ zmin=-1,
109
+ zmax=1,
110
+ text=text_values,
111
+ texttemplate='%{text}',
112
+ textfont={"size": 12},
113
+ colorbar=dict(
114
+ title=dict(text="Correlation", side="right"),
115
+ tickmode="linear",
116
+ tick0=-1,
117
+ dtick=0.2
118
+ ),
119
+ hoverongaps=False,
120
+ hovertemplate='%{y} vs %{x}<br>Correlation: %{z:.3f}<extra></extra>'
121
+ ))
122
+
123
+ fig_corr.update_layout(
124
+ title='<b>Correlation Matrix (Lower Triangle)</b>',
125
+ xaxis_title="",
126
+ yaxis_title="",
127
+ xaxis={'side': 'bottom'},
128
+ yaxis={'autorange': 'reversed'},
129
+ width=700,
130
+ height=600,
131
+ plot_bgcolor='rgba(0,0,0,0)',
132
+ paper_bgcolor='rgba(0,0,0,0)'
133
+ )
134
+
135
+ # Update axes to show all labels
136
+ fig_corr.update_xaxes(tickangle=45)
137
+
138
+ st.plotly_chart(fig_corr, width='stretch')
139
+
tabs/main_page_tabs/dataset_overview.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ Dataset Overview Tab
3
+ This tab displays basic information about the dataset including:
4
+ - First few rows of data
5
+ - Data types and missing values
6
+ - Summary statistics
7
+ """
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import os
12
+ from pathlib import Path
13
+ from utils.icons import lucide_icon
14
+ from utils.load_data import load_dataset
15
+
16
+ def find_data_dir():
17
+ current = Path(__file__).resolve()
18
+ for parent in current.parents:
19
+ if (parent / "review_data").exists():
20
+ return parent / "review_data"
21
+ return None
22
+
23
+ DATA_DIR = find_data_dir()
24
+
25
+ def render():
26
+ st.sidebar.header('Data')
27
+
28
+ # Convert Path object back to string for os.path functions if needed
29
+ if not os.path.isdir(str(DATA_DIR)):
30
+ st.sidebar.error(f"Missing folder. Currently looking at: {DATA_DIR}")
31
+ else:
32
+ files = sorted([f for f in os.listdir(DATA_DIR) if f.lower().endswith((".csv", ".parquet"))])
33
+ if files:
34
+ selected = st.sidebar.selectbox("Choose a dataset", ("-- Select a category --", 'Beauty', 'Appliances', 'Baby Products', 'Electronics', 'Health and Household', 'Movies and TV'))
35
+ if selected == "-- Select a category --":
36
+ st.stop()
37
+ else:
38
+ st.sidebar.write("You selected:", selected)
39
+ df = load_dataset(DATA_DIR, selected) #type: ignore
40
+ df.drop(columns=['images', 'asin', 'parent_asin', 'user_id'], inplace=True, errors='ignore')
41
+ else:
42
+ st.sidebar.warning("No .csv or .parquet files found in review_data/.")
43
+
44
+
45
+ st.markdown(
46
+ f'<h2 class="section-header icon-header">{lucide_icon("layout-dashboard", size=28)} Dataset Overview</h2>',
47
+ unsafe_allow_html=True
48
+ )
49
+
50
+ # Basic Information
51
+ st.markdown(
52
+ f'<h3>{lucide_icon("table", size=20)} Basic Information</h3>',
53
+ unsafe_allow_html=True
54
+ )
55
+ st.dataframe(df.head(), use_container_width=True)
56
+
57
+ # Data Types & Missing Values
58
+ st.markdown(
59
+ f"<h3>{lucide_icon('info', size=20)} Data Types & Missing Values<h3>",
60
+ unsafe_allow_html=True
61
+ )
62
+ info_df = pd.DataFrame({
63
+ 'Column': df.columns,
64
+ 'Data Type': df.dtypes.astype(str),
65
+ 'Non-Null Count': df.count(),
66
+ 'Missing Values': df.isnull().sum(),
67
+ 'Missing %': (df.isnull().sum() / len(df) * 100).round(2)
68
+ })
69
+ st.dataframe(info_df, use_container_width=True)
70
+
71
+ # Summary statistics
72
+ st.markdown(
73
+ f"<h3>{lucide_icon('calculator', size=20)} Summary Statistics<h3>",
74
+ unsafe_allow_html=True
75
+ )
76
+
77
+ st.dataframe(df.describe(), use_container_width=True)
78
+
79
+ st.session_state.explore_df = df
80
+
tabs/main_page_tabs/home.py ADDED
@@ -0,0 +1,103 @@
1
+ import streamlit as st
2
+
3
+ def render():
4
+ st.markdown("""
5
+ <style>
6
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
7
+ .custom-section {
8
+ font-family: 'Inter', sans-serif;
9
+ font-size: 16px;
10
+ line-height: 1.4;
11
+ }
12
+ .custom-section strong {
13
+ font-weight: 600;
14
+ color: #2E86AB;
15
+ display: inline-block;
16
+ margin-bottom: 8px;
17
+ }
18
+ </style>
19
+ """, unsafe_allow_html=True)
20
+
21
+ st.markdown("## :earth_asia: Navigation")
22
+
23
+ st.markdown("""
24
+ <div class="custom-section">
25
+ This home page outlines our project of using <a href="https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/tree/main">Amazon review data</a> to understand consumer behavior and help businesses improve their products and services. Select a dataset from the sidebar and use the tabs above to explore the data.
26
+ </div>
27
+ """, unsafe_allow_html=True)
28
+
29
+ st.markdown("\n")
30
+
31
+ st.info("""
32
+ **Available Tabs:**
33
+ - **Dataset Overview** — Basic information and summary statistics
34
+ - **Univariate Analysis** — Visualize individual columns with various plots
35
+ - **Bivariate Analysis** — Compare relationships between two columns
36
+ - **Target** — Analyze the target variable (helpful_votes)
37
+ - **Text Analysis** — Examine review titles and text content
38
+ """)
39
+
40
+ # New section for page navigation
41
+ st.markdown("### 📑 App Pages")
42
+ st.markdown("""
43
+ <div class="custom-section">
44
+ <strong>🏠 About (Current Page)</strong><br>
45
+ Learn about the project, explore our Amazon review datasets, and understand the data through various analyses.
46
+ <br><br>
47
+
48
+ <strong>🔍 Topic Modeling</strong><br>
49
+ Apply <a href="https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html">NMF (Non-negative Matrix Factorization)</a> topic modeling to discover themes in reviews. Choose a dataset and text column, then visualize the extracted topics and their distributions.
50
+ <br><br>
51
+
52
+ <strong>🤖 Predictive Modeling</strong><br>
53
+ Explore our machine learning model that predicts review helpfulness. See a live demo of how the model works and what features drive predictions.
54
+ <br><br>
55
+
56
+ <strong>👤 User Workspace</strong><br>
57
+ Your personal analysis environment where you can:
58
+ <ul>
59
+ <li>Upload your own customer review data</li>
60
+ <li>Explore and visualize</li>
61
+ <li>Apply topic modeling to your reviews</li>
62
+ <li>Use our predictive model on your data</li>
63
+ <li>Chat with an AI assistant to interpret results and gain insights</li>
64
+ </ul>
65
+ </div>
66
+ """, unsafe_allow_html=True)
67
+
68
+ st.divider()
69
+
70
+ st.markdown("## :blue[:material/description:] ADS505 Project Description")
71
+ st.markdown("""
72
+ <div class="custom-section">
73
+ This project analyzes Amazon customer review data to identify what makes a review helpful.
74
+ Using data science and machine learning, we build predictive models to surface the most helpful reviews and apply topic modeling to
75
+ uncover the key themes and characteristics of helpful reviews across product categories and ratings.
76
+ </div>
77
+ """, unsafe_allow_html=True)
78
+
79
+ st.divider()
80
+
81
+ st.markdown('## 🚩 Problem Statement')
82
+ st.markdown("""
83
+ <div class="custom-section">
84
+ <strong>What characteristics make an Amazon product review helpful?</strong><br>
85
+ By analyzing large-scale review datasets, this project identifies linguistic, structural, and contextual
86
+ features that correlate with helpfulness votes.
87
+ <br><br>
88
+
89
+ <div style="margin-top: 0px;">
90
+ <strong style="margin-bottom: 8px;">Our Goals:</strong>
91
+ <ol style="margin: 0; padding-left: 20px;">
92
+ <li><strong>Predictive Modeling</strong> — Develop models to identify reviews most likely to be found helpful</li>
93
+ <li><strong>Topic Analysis</strong> — Understand how helpful review characteristics differ by product category and rating</li>
94
+ </ol>
95
+ </div>
96
+
97
+ <div style="margin-top: 16px;">
98
+ <strong style="margin-bottom: 8px;">Impact:</strong>
99
+ Help users write better reviews, enable companies to highlight valuable feedback,
100
+ and provide insights for product and system improvements.
101
+ </div>
102
+ </div>
103
+ """, unsafe_allow_html=True)
tabs/main_page_tabs/target_variable.py ADDED
@@ -0,0 +1,281 @@
1
+ """
2
+ Target Variable Analysis Tab
3
+ This tab provides detailed analysis of the target variable (helpful_vote):
4
+ - Distribution visualizations (raw and log scale)
5
+ - Helpfulness categories (Not Helpful vs Helpful)
6
+ - Category statistics
7
+ - Advanced analysis (box plots, cumulative distribution)
8
+ - Detailed vote count distribution
9
+ """
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ import numpy as np
14
+ import plotly.express as px
15
+ import plotly.graph_objects as go
16
+ from utils.icons import lucide_icon
17
+
18
+
19
+ def create_helpfulness_categories(df):
20
+ """
21
+ Create binary categories: helpful vs not helpful.
22
+
23
+ Args:
24
+ df (pd.DataFrame): The main dataset
25
+
26
+ Returns:
27
+ tuple: (df with new category column, bin_info dict)
28
+ """
29
+ helpful_votes = df['helpful_vote']
30
+
31
+ # Simple binary classification
32
+ # 0 votes = Not Helpful (no one found it helpful)
33
+ # 1+ votes = Helpful (at least one person found it helpful)
34
+ def categorize_helpfulness(votes):
35
+ if votes == 0:
36
+ return "Not Helpful (0 votes)"
37
+ else:
38
+ return "Helpful (1+ votes)"
39
+
40
+ df['helpfulness_category'] = helpful_votes.apply(categorize_helpfulness)
41
+
42
+ # Store bin info for display
43
+ bin_info = {
44
+ 'Not Helpful (0 votes)': "Reviews that received 0 helpful votes",
45
+ 'Helpful (1+ votes)': "Reviews that received 1 or more helpful votes"
46
+ }
47
+
48
+ return df, bin_info
49
+
50
+
51
+ def render(df):
52
+ """
53
+ Render the Target Variable Analysis tab.
54
+
55
+ Args:
56
+ df (pd.DataFrame): The main dataset
57
+ """
58
+ st.markdown(
59
+ f'<h2 class="section-header icon-header">{lucide_icon("target", size=28)} Target Variable Analysis</h2>',
60
+ unsafe_allow_html=True
61
+ )
62
+
63
+ st.info(f"""
64
+ **Classification Logic:**
65
+ - **Not Helpful (0 votes)**: Reviews that received 0 helpful votes - no one found them helpful
66
+ - **Helpful (1+ votes)**: Reviews that received 1 or more helpful votes - at least one person found them helpful
67
+ """)
68
+
69
+ st.markdown("\n")
70
+
71
+ if 'helpful_vote' in df.columns:
72
+ # Apply categorization
73
+ df_with_categories, bin_info = create_helpfulness_categories(df)
74
+
75
+ # Display bin information
76
+ st.markdown(
77
+ f"""
78
+ <h3 style="bottom-margin: -10px;"{lucide_icon('tags', size=20)} Helpfulness Categories
79
+ </h3>
80
+ """,
81
+ unsafe_allow_html=True
82
+ )
83
+
84
+
85
+ # Row 1: Distribution Analysis
86
+ col1, col2 = st.columns(2)
87
+
88
+ with col1:
89
+ # Original distribution
90
+ fig_target = px.histogram(
91
+ df,
92
+ x='helpful_vote',
93
+ title='<b>Distribution of Helpful Votes (Raw)</b>',
94
+ color_discrete_sequence=['#e74c3c'],
95
+ nbins=50
96
+ )
97
+ fig_target.update_layout(
98
+ xaxis_title="Number of Helpful Votes",
99
+ yaxis_title="Frequency"
100
+ )
101
+ st.plotly_chart(fig_target, use_container_width=True)
102
+
103
+ with col2:
104
+ # Log-scale distribution for better visualization
105
+ df_nonzero = df[df['helpful_vote'] > 0]
106
+ if len(df_nonzero) > 0:
107
+ fig_log = px.histogram(
108
+ df_nonzero,
109
+ x='helpful_vote',
110
+ title='<b>Distribution of Helpful Votes (Log Scale)</b>',
111
+ color_discrete_sequence=['#9b59b6'],
112
+ nbins=30,
113
+ log_y=True
114
+ )
115
+ fig_log.update_layout(
116
+ xaxis_title="Number of Helpful Votes",
117
+ yaxis_title="Frequency (Log Scale)"
118
+ )
119
+ st.plotly_chart(fig_log, use_container_width=True)
120
+ else:
121
+ st.markdown(
122
+ f"{lucide_icon('info', size=16)} No non-zero helpful votes to display in log scale",
123
+ unsafe_allow_html=True
124
+ )
125
+ st.markdown("\n")
126
+
127
+ # Row 2: Categorical Analysis
128
+ col1, col2 = st.columns([.6, .4], border=True)
129
+
130
+ with col1:
131
+ # Categorical distribution - using Luigi's bar chart style
132
+ category_counts = df_with_categories['helpfulness_category'].value_counts().reset_index()
133
+ category_counts.columns = ['Category', 'Count']
134
+
135
+ fig_categories = px.bar(
136
+ category_counts,
137
+ x='Count',
138
+ y='Category',
139
+ orientation='h',
140
+ title='<b>Distribution by Helpfulness Category</b>',
141
+ color_discrete_sequence=['#184A90'],
142
+ text='Count'
143
+ )
144
+ fig_categories.update_layout(
145
+ margin=dict(l=200, r=20, t=50, b=20),
146
+ yaxis={'categoryorder':'total ascending', 'title': ''},
147
+ xaxis_title='Count'
148
+ )
149
+ fig_categories.update_traces(texttemplate='%{text}', textposition='outside')
150
+ st.plotly_chart(fig_categories, use_container_width=True)
151
+
152
+ with col2:
153
+ # Category statistics
154
+ st.markdown(
155
+ f"""
156
+ <h3 style="margin-bottom: -15px;">{lucide_icon('target', size=20)} Category Statistics
157
+ </h3>
158
+ """,
159
+ unsafe_allow_html=True
160
+ )
161
+
162
+ category_stats = []
163
+ for category in ['Not Helpful (0 votes)', 'Helpful (1+ votes)']:
164
+ count = (df_with_categories['helpfulness_category'] == category).sum()
165
+ percentage = (count / len(df_with_categories)) * 100
166
+ category_stats.append({
167
+ 'Category': category,
168
+ 'Count': f"{count:,}",
169
+ 'Percentage': f"{percentage:.1f}%"
170
+ })
171
+
172
+ stats_df = pd.DataFrame(category_stats)
173
+ st.dataframe(stats_df, width='stretch', hide_index=True)
174
+
175
+ # Overall statistics
176
+ st.markdown(
177
+ f"""
178
+ <h3 style="margin-bottom: -15px;">{lucide_icon('trending-up', size=20)} Overall Statistics
179
+ </h3>
180
+ """,
181
+ unsafe_allow_html=True
182
+ )
183
+ overall_stats = {
184
+ 'Metric': ['Total Reviews', 'Mean Votes', 'Median Votes', 'Std Dev', 'Max Votes'],
185
+ 'Value': [
186
+ f"{df['helpful_vote'].count():,}",
187
+ f"{df['helpful_vote'].mean():.2f}",
188
+ f"{df['helpful_vote'].median():.2f}",
189
+ f"{df['helpful_vote'].std():.2f}",
190
+ f"{df['helpful_vote'].max():,}"
191
+ ]
192
+ }
193
+ overall_df = pd.DataFrame(overall_stats)
194
+ st.dataframe(overall_df, width='stretch', hide_index=True)
195
+
196
+ st.markdown("\n")
197
+
198
+ # Row 3: Advanced Analysis
199
+ st.markdown(
200
+ f"""
201
+ <h3 style="margin-bottom: -10px;>{lucide_icon('search', size=20)} Advanced Target Variable Analysis
202
+ </h3>
203
+ """,
204
+ unsafe_allow_html=True
205
+ )
206
+
207
+ col1, col2 = st.columns(2)
208
+
209
+ with col1:
210
+ # Box plot by category
211
+ fig_box = px.box(
212
+ df_with_categories,
213
+ x='helpfulness_category',
214
+ y='helpful_vote',
215
+ title='<b>Helpful Votes Distribution by Category</b>',
216
+ color_discrete_sequence=['#184A90']
217
+ )
218
+ fig_box.update_layout(
219
+ xaxis_title="Helpfulness Category",
220
+ yaxis_title="Number of Helpful Votes",
221
+ showlegend=False
222
+ )
223
+ st.plotly_chart(fig_box, use_container_width=True)
224
+
225
+ with col2:
226
+ # Cumulative distribution
227
+ sorted_votes = np.sort(df['helpful_vote'])
228
+ cumulative_pct = np.arange(1, len(sorted_votes) + 1) / len(sorted_votes) * 100
229
+
230
+ fig_cumulative = go.Figure()
231
+ fig_cumulative.add_trace(go.Scatter(
232
+ x=sorted_votes,
233
+ y=cumulative_pct,
234
+ mode='lines',
235
+ name='Cumulative %',
236
+ line=dict(color='#2c3e50', width=2)
237
+ ))
238
+
239
+ fig_cumulative.update_layout(
240
+ title='<b>Cumulative Distribution of Helpful Votes</b>',
241
+ xaxis_title='Number of Helpful Votes',
242
+ yaxis_title='Cumulative Percentage (%)',
243
+ showlegend=False
244
+ )
245
+ st.plotly_chart(fig_cumulative, use_container_width=True)
246
+
247
+ # Value counts for helpful votes (moved to bottom)
248
+ st.markdown("\n")
249
+
250
+ st.markdown(
251
+ f"""
252
+ <h3 style="margin-bottom: -10px;"{lucide_icon('hash', size=20)} Detailed Helpful Votes Distribution (Top 20)
253
+ </h3>
254
+ """,
255
+ unsafe_allow_html=True
256
+ )
257
+ # Filter out 0 votes and get top 20
258
+ value_counts = df[df['helpful_vote'] > 0]['helpful_vote'].value_counts().head(20).reset_index()
259
+ value_counts.columns = ['Helpful Votes', 'Count']
260
+
261
+ fig_counts = px.bar(
262
+ value_counts,
263
+ x='Helpful Votes',
264
+ y='Count',
265
+ title='<b>Top 20 Most Common Helpful Vote Counts (Excluding 0)</b>',
266
+ color_discrete_sequence=['#184A90'],
267
+ text='Count'
268
+ )
269
+ fig_counts.update_traces(texttemplate='%{text}', textposition='outside')
270
+ fig_counts.update_layout(
271
+ xaxis_title="Number of Helpful Votes",
272
+ yaxis_title="Frequency"
273
+ )
274
+ st.plotly_chart(fig_counts, use_container_width=True)
275
+
276
+ else:
277
+ st.warning(
278
+ f"{lucide_icon('alert-triangle', size=16)} Target variable 'helpful_vote' not found in the dataset.",
279
+ #unsafe_allow_html=True
280
+ )
281
+
tabs/main_page_tabs/text_analysis.py ADDED
@@ -0,0 +1,100 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import plotly.express as px
5
+ import plotly.graph_objects as go
6
+ from utils.icons import lucide_icon
7
+ from wordcloud import WordCloud
8
+ import textstat
9
+
10
+ @st.cache_data(show_spinner='Processing Text Analysis...', show_time=True)
11
+ def get_text_analysis_data(_df, text_col):
12
+ """
13
+ Caches both the wordcloud image and the text statistics.
14
+ The underscore '_' in _df prevents Streamlit from hashing the entire dataframe.
15
+ """
16
+ # Clean the text once
17
+ text_series = _df[text_col].dropna().astype(str)
18
+ full_text = " ".join(text_series)
19
+
20
+ # Generate WordCloud
21
+ wc = WordCloud(
22
+ width=600,
23
+ height=300,
24
+ background_color='white'
25
+ ).generate(full_text)
26
+
27
+ # Calculate textstat metrics (expensive operations)
28
+ stats = {
29
+ "avg_len": text_series.str.len().mean(),
30
+ "fk_grade": text_series.apply(textstat.flesch_kincaid_grade).mean(), # type: ignore
31
+ "s_count": text_series.apply(textstat.syllable_count).mean(), # type: ignore
32
+ "l_count": text_series.apply(textstat.lexicon_count).mean(), # type: ignore
33
+ "wc_image": wc.to_array()
34
+ }
35
+
36
+ return stats
37
+
38
+ def render(df):
39
+ st.markdown(
40
+ f'<h2 class="section-header icon-header">{lucide_icon("bar-chart-2", size=28)} Text Analysis</h2>',
41
+ unsafe_allow_html=True
42
+ )
43
+
44
+ text_col = st.selectbox(
45
+ "Select a text column",
46
+ options=df.columns,
47
+ index=None,
48
+ placeholder="--Select Text Column--"
49
+ )
50
+
51
+ if not text_col:
52
+ st.stop()
53
+
54
+ # Retrieve all cached results at once
55
+ analysis = get_text_analysis_data(df, text_col)
56
+
57
+ # Layout Metrics
58
+ col_a, col_b, col_c, col_d = st.columns(4)
59
+
60
+ col_a.metric(f'Avg {text_col.title()} Length', f"{analysis['avg_len']:.2f}")
61
+ col_b.metric('Avg Grade Level', f"{analysis['fk_grade']:.0f}")
62
+ col_c.metric('Avg Syllable Count', f"{analysis['s_count']:.2f}")
63
+ col_d.metric('Avg Word Count', f"{analysis['l_count']:.2f}")
64
+
65
+ st.markdown("\n")
66
+
67
+ # Display cached WordCloud image
68
+ st.image(analysis['wc_image'], width="content", caption="Image courtesy of you")
69
+
70
+ st.markdown("\n")
71
+
72
+ top_10 = df[text_col].value_counts().nlargest(10).reset_index()
73
+ top_10.columns = [text_col, 'count']
74
+
75
+ cat_fig = px.bar(
76
+ top_10,
77
+ x='count',
78
+ y=text_col,
79
+ orientation='h',
80
+ title=f'<b>Top 10 Categories for {text_col.title()}</b>',
81
+ color_discrete_sequence=['#184A90'],
82
+ text='count'
83
+ )
84
+
85
+ cat_fig.update_layout(
86
+ margin=dict(l=100, r=20, t=50, b=20),
87
+ yaxis={'categoryorder':'total ascending', 'title': ''},
88
+ xaxis_title='Count',
89
+ autosize=True
90
+ )
91
+
92
+ cat_fig.update_traces(texttemplate='%{text}', textposition='outside')
93
+
94
+ # The "Modern" way to call it:
95
+ st.plotly_chart(
96
+ cat_fig,
97
+ use_container_width=True,
98
+ config={'displayModeBar': False} # Example of using the 'config' dict
99
+ )
100
+
tabs/main_page_tabs/univariate_analysis.py ADDED
@@ -0,0 +1,134 @@
1
+ """
2
+ Univariate Analysis Tab
3
+ This tab displays distributions of individual variables:
4
+ - Numerical variables: Histograms in a grid layout
5
+ - Categorical variables: Bar charts for each category
6
+ """
7
+
8
+ import streamlit as st
9
+ import plotly.express as px
10
+ import plotly.graph_objects as go
11
+ from utils.icons import lucide_icon
12
+
13
+
14
+ def render(df):
15
+ """
16
+ Render the Univariate Analysis tab.
17
+
18
+ Args:
19
+ df (pd.DataFrame): The main dataset
20
+ """
21
+ st.markdown(
22
+ f'<h2 class="section-header icon-header">{lucide_icon("bar-chart-2", size=28)} Univariate Analysis</h2>',
23
+ unsafe_allow_html=True
24
+ )
25
+
26
+ with st.form('Univariate Form'):
27
+
28
+ col1, col2 = st.columns(2)
29
+
30
+ col1.subheader('Select Column')
31
+ explore_column = col1.selectbox(
32
+ "Choose a column",
33
+ list(df.columns),
34
+ index=None,
35
+ key='explore_column',
36
+ placeholder="-- Select to choose --"
37
+ )
38
+
39
+ col2.subheader('Select Plot Type')
40
+ column_plot = col2.selectbox(
41
+ "Choose a plot type",
42
+ ['Histogram', "Bar", "Box"],
43
+ index=None,
44
+ placeholder="-- Select to choose --"
45
+ )
46
+
47
+ submitted = st.form_submit_button('Plot Away')
48
+
49
+ # Histogram
50
+ if column_plot == "Histogram":
51
+
52
+ fig_hist = go.Figure()
53
+
54
+ upper_limit = df[explore_column].quantile(0.99)
55
+ df_filtered = df[df[explore_column] <= upper_limit]
56
+ num_outliers = (df[explore_column] > upper_limit).sum()
57
+ outlier_percent = (num_outliers / len(df)) * 100
58
+
59
+ fig_hist.add_trace(
60
+ go.Histogram(
61
+ x=df_filtered[explore_column],
62
+ nbinsx=30,
63
+ name=explore_column,
64
+ showlegend=False,
65
+ marker = dict(
66
+ color='#184A90',
67
+ line=dict(color='white')
68
+ )
69
+ ),
70
+ )
71
+
72
+ fig_hist.add_annotation(
73
+ xref="paper", yref="paper", # position relative to the figure
74
+ x=0.98, y=1.05,
75
+ text="ℹ️ Outlier Info",
76
+ showarrow=False,
77
+ font=dict(size=12, color="gray"),
78
+ hovertext="Outliers are filtered above the 99th percentile.<br><br>"f"Threshold: {upper_limit:.2f}<br>"f"Number: {num_outliers}<br>"f"Percent of Data: {outlier_percent:.2f}%",
79
+ hoverlabel=dict(bgcolor="white"),
80
+ )
81
+
82
+ fig_hist.update_layout(
83
+ title_text=f"<b>Distributions of {explore_column}</b>",
84
+ height=400,
85
+ showlegend=False
86
+ )
87
+ st.plotly_chart(fig_hist, use_container_width=True)
88
+
89
+ # Bar chart
90
+ if column_plot == "Bar":
91
+
92
+ if df[explore_column].nunique() > 10:
93
+ # Show top 10 categories for variables with many categories
94
+ top_10 = df[explore_column].value_counts().nlargest(10).reset_index()
95
+ top_10.columns = [explore_column, 'count']
96
+
97
+ cat_fig = px.bar(
98
+ top_10,
99
+ x='count',
100
+ y=explore_column,
101
+ orientation='h',
102
+ title=f'<b>Top 10 Categories for {explore_column}</b>',
103
+ color_discrete_sequence=['#184A90'],
104
+ text='count'
105
+ )
106
+ cat_fig.update_layout(
107
+ margin=dict(l=250, r=20, t=50, b=20),
108
+ yaxis={'categoryorder':'total ascending', 'title': ''},
109
+ xaxis_title='Count'
110
+ )
111
+ cat_fig.update_traces(texttemplate='%{text}', textposition='outside')
112
+ else:
113
+ # Regular histogram for variables with few categories
114
+ cat_fig = px.histogram(
115
+ df,
116
+ x=explore_column,
117
+ title=f'<b>Distribution of {explore_column}</b>',
118
+ color_discrete_sequence=['#184A90']
119
+ )
120
+ cat_fig.update_xaxes(categoryorder="total descending")
121
+
122
+ st.plotly_chart(cat_fig, use_container_width=True)
123
+
124
+ # Box Plot
125
+ if column_plot == "Box":
126
+ box_fig = px.box(
127
+ df,
128
+ x=explore_column,
129
+ orientation='v',
130
+ title=f'<b>Box plot for {explore_column}</b>'
131
+ )
132
+
133
+ st.plotly_chart(box_fig, use_container_width=True)
134
+
tabs/predictive_model_tabs/__init__.py ADDED
File without changes
tabs/predictive_model_tabs/pred_model_one.py ADDED
@@ -0,0 +1,244 @@
1
+ import streamlit as st
2
+
3
+ # include expected schema for uploaded data
4
+
5
+ def render():
6
+
7
+ st.markdown("""
8
+ <style>
9
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
10
+ .custom-section {
11
+ font-family: 'Inter', sans-serif;
12
+ font-size: 16px;
13
+ line-height: 1.4;
14
+ }
15
+ .custom-section strong {
16
+ font-weight: 600;
17
+ color: #2E86AB;
18
+ display: inline-block;
19
+ margin-bottom: 8px;
20
+ }
21
+ .custom-section ol,
22
+ .custom-section ul {
23
+ margin-top: 0 !important;
24
+ margin-bottom: 10px !important;
25
+ padding-top: 0 !important;
26
+ }
27
+ </style>
28
+ """, unsafe_allow_html=True)
29
+
30
+ st.markdown("## 🤖 About Predictive Modeling")
31
+
32
+ st.markdown("""
33
+ <div class="custom-section">
34
+ <strong>Project Goal</strong><br>
35
+ Our objective was to build a model that predicts which customer reviews will be found helpful by others. By identifying helpful negative reviews, we can surface potential product or service issues worth investigating. Similarly, elevating helpful positive reviews highlights what customers value most.
36
+ <br><br>
37
+
38
+ <strong>Secondary Benefit</strong><br>
39
+ Understanding the characteristics of helpful reviews enables reviewers to improve the quality of their feedback, making it more valuable for both businesses and consumers.
40
+
41
+ <strong>What You'll Find Below</strong>
42
+ <ul>
43
+ <li><strong>Data Schema</strong> — Required format and fields needed to run predictions</li>
44
+ <li><strong>Modeling Process</strong> — Step-by-step explanation of how the model works</li>
45
+ <li><strong>Interactive Demo (Tab 2)</strong> — Hands-on walkthrough before applying the model to your own data on the User Page</li>
46
+ </ul>
47
+ </div>
48
+ """, unsafe_allow_html=True)
49
+
50
+ #st.divider()
51
+ st.markdown("""
52
+ <hr style='
53
+ border: none;
54
+ height: 2px;
55
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
56
+ margin: 20px 0;
57
+ '>
58
+ """, unsafe_allow_html=True)
59
+
60
+ st.markdown("## Data Schema")
61
+
62
+ st.markdown("""
63
+
64
+ | Column Name | Data Type | Description |
65
+ |------------|-----------|-------------|
66
+ | `lemma_title` | string | Lemmatized version of the review title |
67
+ | `lemma_text` | string | Lemmatized version of the review text |
68
+ | `images` | boolean | Binary indicator if the review includes an image or not |
69
+ | `Review Length` | integer | Character count of the review text |
70
+ | `Title Length` | integer | Character count of the review title |
71
+
72
+ *Read more about lemmatization and the process used in our models [here](https://www.geeksforgeeks.org/python/python-lemmatization-with-nltk/)*
73
+ """)
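For readers who want to build this schema from raw reviews, the sketch below shows one plausible way to produce the lemmatized and length columns with NLTK and pandas. It is illustrative only — the tokenization, sample values, and helper name are assumptions, not the app's actual preprocessing code.

```python
# A minimal, hedged sketch of producing the schema columns above (not the app's code).
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time NLTK downloads may be needed: nltk.download('punkt'); nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize(text: str) -> str:
    """Lowercase, tokenize, and lemmatize a piece of review text."""
    return " ".join(lemmatizer.lemmatize(tok) for tok in word_tokenize(text.lower()))

df = pd.DataFrame({
    "review_title": ["Works great"],
    "text": ["The batteries lasted for weeks and charged quickly."],
    "images": [False],
})
df["lemma_title"] = df["review_title"].apply(lemmatize)
df["lemma_text"] = df["text"].apply(lemmatize)
df["Review Length"] = df["text"].str.len()
df["Title Length"] = df["review_title"].str.len()
```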
74
+
75
+ st.markdown("""
76
+ <hr style='
77
+ border: none;
78
+ height: 2px;
79
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
80
+ margin: 20px 0;
81
+ '>
82
+ """, unsafe_allow_html=True)
83
+
84
+ st.markdown("## Model Components")
85
+
86
+ st.markdown("""
87
+ <div class="custom-section">
88
+ Our model uses a <strong>four-stage pipeline</strong> to predict review helpfulness. We trained on 60,000+ reviews
89
+ and achieved <strong>71.7% accuracy</strong> and <strong>63.6% F1-macro score</strong>. The model outputs probability scores,
90
+ allowing you to rank and prioritize reviews that are most likely to be found helpful by customers.
91
+ </div><br>
92
+ """, unsafe_allow_html=True)
93
+
94
+ st.markdown("### The Pipeline")
95
+
96
+ st.markdown("""
97
+ <div class="custom-section">
98
+
99
+ <strong>1. TF-IDF Vectorization</strong> — Extracting meaningful text patterns<br>
100
+ We transform `lemma_title` and `lemma_text` into numerical features using Term Frequency-Inverse Document Frequency (TF-IDF).
101
+ This approach identifies words and phrases that distinguish helpful reviews from unhelpful ones by balancing how often a term
102
+ appears in a specific review against how common it is across all reviews. Words that appear frequently in helpful reviews but
103
+ rarely elsewhere receive higher weights, making them strong predictive signals.
104
+ <br>
105
+
106
+ <strong>Why TF-IDF?</strong> It automatically downweights generic words while highlighting distinctive language patterns.
107
+ We use 1-2 word phrases (unigrams and bigrams) to capture meaningful combinations like "works great" or "poor quality."
108
+ <br><br>
109
+
110
+ <strong>2. Standard Scaler</strong> — Normalizing review metrics<br>
111
+ Review length and title length are scaled to have mean=0 and standard deviation=1. This prevents longer reviews from
112
+ dominating the model simply due to scale differences.
113
+ <br>
114
+
115
+ <strong>Known limitation:</strong> We discovered that helpfulness has a non-linear relationship with length. Very short and
116
+ very long reviews both tend to receive fewer helpful votes, with medium-length reviews performing best. Our linear scaling
117
+ doesn't fully capture this relationship, suggesting polynomial features or binning could improve future iterations.
118
+ <br><br>
119
+
120
+ <strong>3. Truncated SVD</strong> — Dimensionality reduction for efficiency<br>
121
+ After TF-IDF, our feature space explodes to 200,000+ dimensions (one for each unique word/phrase). We use Truncated Singular
122
+ Value Decomposition to compress this down to <strong>800 components</strong> while retaining <strong>70% of the variance</strong>.
123
+ This dramatically speeds up training while maintaining predictive power.
124
+ <br>
125
+
126
+ <strong>Why Truncated SVD over PCA?</strong> It works directly with sparse matrices (TF-IDF produces mostly zeros), making it
127
+ far more memory-efficient. We tuned the component count by balancing F1-macro score against model complexity.
128
+ <br><br>
129
+
130
+ <strong>4. Stochastic Gradient Descent Classifier (SGDC)</strong> — The final predictor<br>
131
+ We compared five models: Decision Trees, K-Nearest Neighbors, Linear SVM, XGBoost, and SGDC. <strong>SGDC emerged as the best
132
+ overall performer,</strong> narrowly beating XGBoost on the gains curve (a metric measuring how well the model prioritizes truly
133
+ helpful reviews at the top of its predictions).
134
+ <br><br>
135
+
136
+ <strong>Key tuning decisions:</strong>
137
+ <ul>
138
+ <li><strong>class_weight='balanced'</strong>: Our data is imbalanced (80% of reviews have zero helpful votes), so we weighted
139
+ the minority class to prevent the model from simply predicting "not helpful" for everything</li>
140
+ <li><strong>loss='modified_huber'</strong>: Provides probability estimates (needed for ranking) while being robust to outliers</li>
141
+ <li><strong>early_stopping=True</strong>: Prevents overfitting by monitoring validation performance</li>
142
+ </ul>
143
+
144
+ <strong>Why SGDC over XGBoost?</strong> While XGBoost had slightly better raw accuracy (72% vs 71.7%), SGDC showed better
145
+ generalization, faster training, and superior performance on the gains curve, meaning it does a better job surfacing the
146
+ <em>most</em> helpful reviews, which is what matters for practical use.
147
+ </div>
148
+ """, unsafe_allow_html=True)
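To make the four stages concrete, here is a minimal sketch of how such a pipeline could be assembled with scikit-learn. It assumes the schema columns listed earlier (`lemma_title`, `lemma_text`, `images`, `Review Length`, `Title Length`) and uses illustrative hyperparameters; it is not the exact saved pipeline shipped with the app.

```python
# A hedged sketch of the described TF-IDF -> scaling -> TruncatedSVD -> SGDC pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer([
    ("tfidf_title", TfidfVectorizer(ngram_range=(1, 2), stop_words="english"), "lemma_title"),
    ("tfidf_text", TfidfVectorizer(ngram_range=(1, 2), stop_words="english"), "lemma_text"),
    ("scale", StandardScaler(), ["Review Length", "Title Length"]),
    ("images", "passthrough", ["images"]),
])

pipeline = Pipeline([
    ("features", Pipeline([
        ("columns", preprocessor),
        ("svd", TruncatedSVD(n_components=800, random_state=0)),  # compresses the sparse TF-IDF space
    ])),
    ("clf", SGDClassifier(
        loss="modified_huber",    # enables predict_proba for ranking reviews
        class_weight="balanced",  # offsets the ~80% "not helpful" majority
        early_stopping=True,
    )),
])

# Usage (train_df is assumed to exist with the columns above and a binary 'vote' target):
# pipeline.fit(train_df[["lemma_title", "lemma_text", "Review Length", "Title Length", "images"]],
#              train_df["vote"])
# probs = pipeline.predict_proba(new_df[pipeline.feature_names_in_])[:, 1]
```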
149
+
150
+ st.markdown("### Model Performance & Insights")
151
+
152
+ st.markdown("""
153
+ <div class="custom-section">
154
+ <strong>What makes a review helpful?</strong> Our analysis revealed three key patterns:
155
+ <ul>
156
+ <li><strong>Including an image</strong> significantly increases helpfulness</li>
157
+ <li><strong>Medium-length reviews</strong> (not too short, not too long) perform best</li>
158
+ <li><strong>Specific vocabulary</strong> varies by product category — suggesting category-specific models could further improve accuracy</li>
159
+ </ul>
160
+
161
+ <strong>Practical application:</strong> The model outputs probability scores (0-1) that allow you to rank your reviews.
162
+ Focus on high-probability <strong>negative</strong> reviews to identify product issues early, and elevate high-probability
163
+ <strong>positive</strong> reviews to guide purchasing decisions.
164
+ </div>
165
+ """, unsafe_allow_html=True)
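As a rough illustration of the "practical application" above, the helper below ranks reviews by predicted probability and flags likely-helpful negative reviews. The column names (`rating`) and threshold are assumptions for the sketch, not part of the app.

```python
# Illustrative only: turn predicted probabilities into a priority reading list.
import pandas as pd

def priority_reviews(df: pd.DataFrame, probs, top_n: int = 50) -> pd.DataFrame:
    """Rank reviews by predicted helpfulness; flag likely-helpful negative reviews."""
    ranked = df.assign(helpful_prob=probs).sort_values("helpful_prob", ascending=False)
    # Assumed convention: a 1-5 star 'rating' column; <=2 stars treated as negative.
    ranked["likely_helpful_negative"] = (ranked["helpful_prob"] > 0.5) & (ranked["rating"] <= 2)
    return ranked.head(top_n)
```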
166
+
167
+ st.markdown("### Limitations & Future Improvements")
168
+
169
+ st.markdown("""
170
+ <div class="custom-section">
171
+ <strong>Current limitations to be aware of:</strong>
172
+ <ul>
173
+ <li><strong>Category-agnostic training</strong> — The model was trained across all product categories. Category-specific models
174
+ would likely improve accuracy since "helpful" looks different for electronics vs. beauty products</li>
175
+ <li><strong>Low helpfulness threshold</strong> — We defined "helpful" as 1+ votes due to computational constraints. A higher
176
+ threshold (e.g., 5+ votes) would be more meaningful but requires training on larger datasets</li>
177
+ <li><strong>Non-linear length relationships</strong> — As mentioned above, polynomial features could better capture the
178
+ sweet spot for review length</li>
179
+ </ul>
180
+
181
+ <strong>What we'd do with more resources:</strong> Train separate models per category, use a higher helpfulness threshold,
182
+ experiment with transformer-based models (BERT, etc.), and incorporate temporal features (how quickly reviews receive votes).
183
+ </div>
184
+ """, unsafe_allow_html=True)
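One hedged idea for the non-linear length limitation noted above is to replace raw character counts with coarse bins; the cut points below are made up for illustration, not tuned values.

```python
# Sketch: bin review length so the model can learn a "medium length is best" pattern.
import pandas as pd

def bin_review_length(lengths: pd.Series) -> pd.Series:
    """Map raw character counts to coarse, illustrative bins."""
    return pd.cut(
        lengths,
        bins=[0, 100, 500, 2000, float("inf")],
        labels=["very_short", "short", "medium", "long"],
    )
```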
185
+
186
+ st.markdown("""
187
+ Below you'll find the specific hyperparameters tuned using [Optuna](https://optuna.readthedocs.io/en/stable/index.html),
188
+ an automated hyperparameter optimization framework. Click each section to see the final parameter values and learn more
189
+ about the methods used.
190
+ """)
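For context, an Optuna study for a classifier like this typically looks like the sketch below. The search ranges, dummy data, and trial count are illustrative assumptions, not the settings used for this model.

```python
# A minimal, hedged Optuna tuning sketch for an SGDClassifier.
import optuna
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Stand-in data so the sketch runs; in practice this would be the SVD-reduced features.
X_train, y_train = make_classification(n_samples=500, n_features=20, weights=[0.8], random_state=0)

def objective(trial):
    clf = SGDClassifier(
        loss="modified_huber",
        penalty="elasticnet",
        alpha=trial.suggest_float("alpha", 1e-5, 1e-2, log=True),
        l1_ratio=trial.suggest_float("l1_ratio", 0.0, 1.0),
        class_weight="balanced",
        early_stopping=True,
    )
    return cross_val_score(clf, X_train, y_train, scoring="f1_macro", cv=3).mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print(study.best_params)
```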
191
+
192
+ pre, pred = st.columns(2)
193
+
194
+ with pre.expander("**Preprocessing Steps**"):
195
+ col1, col2, col3, col4 = st.columns(4)
196
+
197
+ with col1.popover("TF-IDF Title"):
198
+ st.write("Learn more about tf-idf [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)")
199
+ st.code("""
200
+ {'max_df': 0.95,
201
+ 'min_df': 1,
202
+ 'ngram_range': (1, 2),
203
+ 'stop_words': 'english',
204
+ 'sublinear_tf': True}
205
+ """)
206
+
207
+ with col2.popover("TF-IDF Text"):
208
+ st.write("""The scikit-learn built-in English stop words argument was used here for convenience; however, there are [known issues](https://scikit-learn.org/stable/modules/feature_extraction.html#stop-words),
209
+ so a future iteration of this project might find improvement in using a more robust selection of stop words and including ones that
210
+ are custom to the specific domain being modeled.""")
211
+ st.code("""
212
+ {'max_df': 0.9,
213
+ 'min_df': 2,
214
+ 'ngram_range': (1, 2),
215
+ 'stop_words': 'english',
216
+ 'sublinear_tf': True}
217
+ """)
218
+
219
+ with col3.popover("Standard Scaler"):
220
+ st.write("Default settings for [Standard Scaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) were used to scale review length and title length")
221
+
222
+ with col4.popover("Truncated SVD"):
223
+ st.write("The only parameter changed was `n_components`. Value used was 800. [Truncated SVD](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html)")
224
+
225
+ with pred.expander("**Predictive Model**"):
226
+ st.write("Model used was Stochastic Gradient Descent Classifier [(SGDC)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)")
227
+ st.code("""
228
+ {'alpha': 0.0002,
229
+ 'class_weight': 'balanced',
230
+ 'early_stopping': True,
231
+ 'eta0': 0.001,
232
+ 'l1_ratio': 0.9,
233
+ 'learning_rate': 'adaptive',
234
+ 'loss': 'modified_huber',
235
+ 'max_iter': 500,
236
+ 'n_iter_no_change': 8,
237
+ 'penalty': 'elasticnet',
238
+ 'validation_fraction': 0.15}
239
+ """)
240
+ st.write("The most important parameters were the loss function, `class_weight`, and `early_stopping`. Every other parameter tuned led to only marginal improvements.")
241
+
242
+
243
+
244
+
tabs/predictive_model_tabs/pred_model_two.py ADDED
@@ -0,0 +1,44 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from utils.load_data import load_dataset
4
+ from utils.plot_gains import plot_gains
5
+ from utils.load_pred_model import load_model, load_demo_data
6
+
7
+ def render():
8
+
9
+ model = load_model() # Using the cached function
10
+ df = load_demo_data()
11
+ model_columns = ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
12
+ X = df[model_columns]
13
+ y = df['vote']
14
+ st.session_state.demo_probs = model.predict_proba(X)
15
+
16
+ with st.expander("Step 1: Load Data and review", icon="🧐"):
17
+
18
+ st.write("You'll first upload your dataset (in CSV or Parquet format) and review it to make sure everything looks right. Below is what the final dataset looks like. We'll take care of the preprocessing steps; the only columns you need to ensure exist in the uploaded data are the review title and text columns, images, and the number of votes each review has so far if using existing data.")
19
+ st.dataframe(df)
20
+
21
+ with st.expander("Step 2: Get Predictions", icon=':material/self_improvement:'):
22
+
23
+ st.write("The next thing we'll do is use the model to make predictions on your data. " \
24
+ "For our purposes, we are predicting the probability that the review belongs to the positive class")
25
+
26
+ prob_button = st.button("Push to predict", icon='🎆', type='secondary')
27
+
28
+ if prob_button:
29
+ prob_df = pd.DataFrame({
30
+ "Actual": y,
31
+ "Probability of helpful vote": st.session_state.demo_probs[:, 1]
32
+ })
33
+ st.dataframe(prob_df)
34
+
35
+ with st.expander("Step 3: Plots the gains", icon=":material/data_thresholding:"):
36
+
37
+ st.write("Once we have our predictions, we can plot the gains curve, which shows us which subset of our data is worth focusing on")
38
+
39
+ if st.button("Plot the gains", icon="🤪", type="secondary"):
40
+ fig, data, total = plot_gains(y, st.session_state.demo_probs[:, 1])
41
+ st.plotly_chart(fig)
42
+
43
+ st.write(f"""We can see from this plot that our best return comes from focusing on the top **{round(total*100, 2)}%** of our reviews,
44
+ which will lead to us capturing **{round(data*100, 2)}%** of all possible cases""")
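For readers curious what `plot_gains` is doing conceptually, a cumulative gains curve can be computed roughly as in the sketch below. This is an assumed stand-in, not the actual `utils.plot_gains` implementation.

```python
# Hedged sketch of computing a cumulative gains curve from labels and probabilities.
import numpy as np

def gains_curve(y_true, probs):
    """Return (fraction of data reviewed, fraction of positives captured), ranked by probability."""
    order = np.argsort(probs)[::-1]                      # highest predicted probability first
    captured = np.cumsum(np.asarray(y_true)[order] > 0)  # running count of truly helpful reviews
    frac_data = np.arange(1, len(order) + 1) / len(order)
    frac_captured = captured / max(captured[-1], 1)
    return frac_data, frac_captured
```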
tabs/user_page_tabs/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .user_about import render as render_about
2
+ from .user_chat_analysis import render as render_analysis
3
+ from .user_pred_model import render as render_pred
4
+ from .user_topic_model import render as render_topic
5
+
6
+ __all__ = ['render_about', 'render_analysis', 'render_pred', 'render_topic']
tabs/user_page_tabs/user_about.py ADDED
@@ -0,0 +1,223 @@
1
+ import streamlit as st
2
+
3
+ def render():
4
+ st.markdown("""
5
+ <style>
6
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
7
+ .custom-section {
8
+ font-family: 'Inter', sans-serif;
9
+ font-size: 16px;
10
+ line-height: 1.4;
11
+ }
12
+ .custom-section strong {
13
+ font-weight: 600;
14
+ color: #2E86AB;
15
+ display: inline-block;
16
+ margin-bottom: 8px;
17
+ }
18
+ .custom-section ol,
19
+ .custom-section ul {
20
+ margin-top: 0 !important;
21
+ margin-bottom: 10px !important;
22
+ padding-top: 0 !important;
23
+ }
24
+ </style>
25
+ """, unsafe_allow_html=True)
26
+
27
+ st.markdown("## 👤 Welcome to Your Workspace")
28
+
29
+ st.markdown("""
30
+ <div class="custom-section">
31
+ This is your personal analysis environment where you can upload your own customer review data, discover topics
32
+ within your reviews, predict which reviews will be most helpful, and chat with an AI assistant to interpret
33
+ your results.
34
+ </div>
35
+ """, unsafe_allow_html=True)
36
+
37
+ st.markdown("""
38
+ <hr style='
39
+ border: none;
40
+ height: 2px;
41
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
42
+ margin: 20px 0;
43
+ '>
44
+ """, unsafe_allow_html=True)
45
+
46
+ st.markdown("## 📊 Required Data Format")
47
+
48
+ st.markdown("""
49
+ <div class="custom-section">
50
+ Your dataset must include the following columns to use all features of this workspace.
51
+ </div><br>
52
+ """, unsafe_allow_html=True)
53
+
54
+ st.markdown("""
55
+ | Column Name | Data Type | Description | Required For |
56
+ |------------|-----------|-------------|--------------|
57
+ | `review_title` or `title` | string | The title of the customer review | Topic Modeling, Predictive Model |
58
+ | `text` | string | The full text content of the review | Topic Modeling, Predictive Model |
59
+ | `images` | boolean/integer | Binary indicator (0/1 or True/False) if the review includes images | Predictive Model |
60
+ | `helpful_vote` | integer | Number of helpful votes the review received | Predictive Model (target variable) |
61
+ | `rating` | integer (1-5) | Star rating given by the reviewer | Optional filtering |
62
+
63
+ **Important notes:**
64
+ - Text columns can contain raw review text (HTML tags will be automatically removed)
65
+ - For best results, include at least 1,000+ reviews
66
+ - The `helpful_vote` column is used as the target variable (0 = not helpful, 1+ = helpful)
67
+ """)
68
+
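As a hedged example of what the notes above imply, a quick pre-upload sanity check could look like the sketch below. The required column names come from the table; the helper itself and the `target` column are illustrative, not part of the app.

```python
# Illustrative upload check: verify required columns and binarize the helpfulness target.
import pandas as pd

REQUIRED = {"review_title", "text", "images", "helpful_vote"}

def check_upload(df: pd.DataFrame) -> pd.DataFrame:
    missing = REQUIRED - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")
    out = df.copy()
    out["target"] = (out["helpful_vote"] >= 1).astype(int)  # 0 = not helpful, 1+ = helpful
    return out
```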
69
+ st.markdown("""
70
+ <hr style='
71
+ border: none;
72
+ height: 2px;
73
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
74
+ margin: 20px 0;
75
+ '>
76
+ """, unsafe_allow_html=True)
77
+
78
+ st.markdown("## 🗺️ How to Use This Workspace")
79
+
80
+ st.markdown("""
81
+ <div class="custom-section">
82
+ <strong>Recommended Workflow:</strong><br>
83
+ Follow these steps to get the most value from your analysis:
84
+ </div><br>
85
+ """, unsafe_allow_html=True)
86
+
87
+ st.markdown("""
88
+ <div class="custom-section">
89
+
90
+ <strong>1️⃣ Topic Modeling Tab</strong><br>
91
+ <strong>Purpose:</strong> Discover the main themes and topics discussed in your reviews
92
+
93
+ <strong>What you'll do:</strong>
94
+ <ul>
95
+ <li>Upload your dataset (CSV or Parquet format)</li>
96
+ <li>Select which text columns to analyze (review title, text, or both)</li>
97
+ <li>Configure topic modeling parameters (number of topics, n-gram range)</li>
98
+ <li>Optional: Filter by rating or helpful votes</li>
99
+ <li>Run the NMF topic model and visualize results</li>
100
+ </ul><br>
101
+
102
+ <strong>What you'll learn:</strong> The model extracts the most important words/phrases for each topic,
103
+ helping you understand what customers are talking about. You can see which topics dominate your reviews
104
+ and how they differ across ratings.
105
+ <br><br>
106
+
107
+ <strong>2️⃣ Predictive Modeling Tab</strong><br>
108
+ <strong>Purpose:</strong> Identify which reviews are most likely to be found helpful by other customers
109
+
110
+ <strong>What you'll do:</strong>
111
+ <ul>
112
+ <li>Choose to reuse data from Topic tab or upload new data</li>
113
+ <li>Click "Prep text & features" to process your reviews (lemmatization, feature engineering)</li>
114
+ <li>Run the prediction model to generate helpfulness probability scores</li>
115
+ <li>View predictions and explore the gains curve</li>
116
+ </ul>
117
+
118
+ <strong>What you'll learn:</strong> The model assigns each review a probability score (0-1) indicating
119
+ how likely it is to receive helpful votes. Use this to:
120
+ <ul>
121
+ <li>Prioritize which negative reviews to investigate for product issues</li>
122
+ <li>Identify positive reviews worth promoting</li>
123
+ <li>Understand what characteristics make reviews helpful in your domain</li>
124
+ </ul>
125
+
126
+ <strong>Pro tip:</strong> The gains curve shows how efficiently the model identifies helpful reviews.
127
+ If you can find 70% of helpful reviews by only reading the top 20% ranked by the model, that's significant time savings!
128
+ <br><br>
129
+
130
+ <strong>3️⃣ Chat Analysis Tab</strong><br>
131
+ <strong>Purpose:</strong> Get AI-powered insights and interpretations of your results
132
+
133
+ <strong>What you'll do:</strong>
134
+ <ul>
135
+ <li>Enter your OpenAI API key in the sidebar</li>
136
+ <li>Ask questions about your topic modeling results</li>
137
+ <li>Get help interpreting model performance metrics</li>
138
+ <li>Discuss what actions to take based on your findings</li>
139
+ </ul><br>
140
+
141
+ <strong>What you'll learn:</strong> The AI assistant has context about your entire analysis (dataset stats,
142
+ topics discovered, model performance) and can help you:
143
+ <ul>
144
+ <li>Name and interpret the topics discovered</li>
145
+ <li>Understand what the model performance metrics mean in practical terms</li>
146
+ <li>Generate actionable recommendations based on your results</li>
147
+ <li>Answer "what if" questions about your data</li>
148
+ </ul><br>
149
+
150
+ <strong>Example questions to ask:</strong>
151
+ <ul>
152
+ <li>"What are the main topics in my reviews and what should I name them?"</li>
153
+ <li>"Which reviews should I prioritize reading first?"</li>
154
+ <li>"What does the gains curve tell me about my model's performance?"</li>
155
+ <li>"Are there patterns in the helpful vs non-helpful reviews?"</li>
156
+ </ul>
157
+ </div>
158
+ """, unsafe_allow_html=True)
159
+
160
+ st.markdown("""
161
+ <hr style='
162
+ border: none;
163
+ height: 2px;
164
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
165
+ margin: 20px 0;
166
+ '>
167
+ """, unsafe_allow_html=True)
168
+
169
+ st.markdown("## 💡 Best Practices")
170
+
171
+ st.markdown("\n")
172
+
173
+ st.markdown("""
174
+ <div class="custom-section">
175
+ <strong>Data Quality</strong>
176
+ <ul>
177
+ <li>Include at least 1,000 reviews for meaningful topic modeling</li>
178
+ <li>Ensure text fields don't have excessive missing values</li>
179
+ <li>If most reviews have 0 helpful votes, consider filtering to reviews with at least 1 vote for topic modeling</li>
180
+ </ul><br>
181
+
182
+ <strong>Topic Modeling Tips</strong>
183
+ <ul>
184
+ <li>Start with 5-7 topics and adjust based on results</li>
185
+ <li>Use unigrams and bigrams (1-2 word phrases) for more interpretable topics</li>
186
+ <li>Add domain-specific stopwords in the optional parameters if common words dominate</li>
187
+ <li>Try filtering by rating to see how topics differ between positive and negative reviews</li>
188
+ </ul><br>
189
+
190
+ <strong>Predictive Modeling Tips</strong>
191
+ <ul>
192
+ <li>Review the gains curve to understand model performance — steep initial rise = efficient identification of helpful reviews</li>
193
+ <li>Focus on the probability scores, not just binary predictions</li>
194
+ <li>Sort predictions by probability to create a priority reading list</li>
195
+ <li>Cross-reference with ratings: high-probability negative reviews = priority issues</li>
196
+ </ul><br>
197
+
198
+ <strong>Chat Analysis Tips</strong>
199
+ <ul>
200
+ <li>Be specific with your questions for better answers</li>
201
+ <li>Ask the AI to suggest names for topics based on the top words</li>
202
+ <li>Use it to brainstorm action items based on your findings</li>
203
+ <li>Click "Reset Chat" to start fresh if the conversation gets off track</li>
204
+ </ul>
205
+ </div>
206
+ """, unsafe_allow_html=True)
207
+
208
+ st.markdown("""
209
+ <hr style='
210
+ border: none;
211
+ height: 2px;
212
+ background: linear-gradient(to right, #2E86AB, #87ceeb, #2E86AB);
213
+ margin: 20px 0;
214
+ '>
215
+ """, unsafe_allow_html=True)
216
+
217
+ st.markdown("## 🚀 Ready to Get Started?")
218
+
219
+ st.markdown("""
220
+ <div class="custom-section">
221
+ Head to the <strong>Topic Modeling</strong> tab to upload your data and begin your analysis!
222
+ </div>
223
+ """, unsafe_allow_html=True)
tabs/user_page_tabs/user_chat_analysis.py ADDED
@@ -0,0 +1,138 @@
1
+ import streamlit as st
2
+ from openai import OpenAI
3
+ import numpy as np
4
+
5
+ def system_prompt():
6
+ # Helper to find target column
7
+ def find_vote_column(df):
8
+ vote_cols = [col for col in df.columns if 'vote' in col.lower()]
9
+ return vote_cols[0] if vote_cols else None
10
+
11
+ # Helper to get key model params only
12
+ def get_key_params(params_dict):
13
+ key_params = ['alpha', 'class_weight', 'l1_ratio', 'loss', 'max_iter',
14
+ 'penalty', 'validation_fraction', 'learning_rate', 'eta0']
15
+ return {k: v for k, v in params_dict.items() if k in key_params}
16
+
17
+ # Build system prompt
18
+ vote_col = find_vote_column(st.session_state.raw_df) if 'raw_df' in st.session_state else None
19
+ target_dist = ""
20
+ if vote_col and 'raw_df' in st.session_state:
21
+ pos_rate = (st.session_state.raw_df[vote_col] > 0).mean() * 100
22
+ target_dist = f"\n- Target distribution: {pos_rate:.2f}% of reviews have helpful votes"
23
+
24
+ context = f"""You are an expert data science assistant helping users analyze customer review data. Your role is to help users understand and
25
+ interpret their analyses naturally - answer questions conversationally without dumping context unless directly relevant. Start each conversation by asking
26
+ the user where they would like to begin without giving a bunch of information and without providing any starting points. Let the user guide the conversation.
27
+ Your role is to be responsive, not proactive.
28
+
29
+ Below is the information from the customer review data the user uploaded. There is some basic information about the data, the results of topic modeling using NMF, the results
30
+ of the predictive modeling steps, and modeling performance on their data. Use the below information to guide your answers to their questions.
31
+
32
+ ## Dataset Overview
33
+ - Shape: {st.session_state.raw_df.shape[0]:,} reviews × {st.session_state.raw_df.shape[1]} columns
34
+ - Columns: {', '.join(st.session_state.raw_df.columns)}{target_dist}
35
+
36
+ ## Topic Modeling (NMF)
37
+ {"- Status: Not yet run" if not st.session_state.get('topics_fitted') else f'''- Analyzed columns: {', '.join(st.session_state.get('topic_columns', []))}
38
+ - TF-IDF n-grams: {st.session_state.get('topic_ngram')}
39
+ - Topics discovered: {len(st.session_state.get('top_topics', {}))}
40
+ - Top words and corresponding weights in percentages rounded to 4 digits {st.session_state.get('top_topics', {})}
41
+ - Each word or phrase is prefixed with the preprocessing step and column name followed by two underscores __. Ignore the prefixes and only refer to the words'''}
42
+
43
+ ## Predictive Model (Helpful Vote Prediction)
44
+ {"- Status: Not yet run" if not st.session_state.get('model_run') else f'''- Purpose: Identify reviews likely to receive helpful votes (for elevation/analysis)
45
+ - Architecture: TF-IDF → TruncatedSVD → SGD Classifier
46
+ - Title n-grams: {st.session_state.get('title_ngram', 'N/A')}
47
+ - Text n-grams: {st.session_state.get('text_ngram', 'N/A')}
48
+ - SVD: {st.session_state.get('svd_comps')} components ({st.session_state.get('explained_variance'):.1f}% variance explained)
49
+ - Model config (key params): {get_key_params(st.session_state.get('model_params', {}))}
50
+ - Features used: {', '.join(list(st.session_state.X.columns))}'''}
51
+
52
+ {"" if not st.session_state.get('model_run') or not st.session_state.get('accuracy') else f'''## Model Performance
53
+ - Accuracy: {st.session_state.get('accuracy', 0):.2f}%
54
+ - Precision: {st.session_state.get('precision', 0):.2f}%
55
+ - Recall: {st.session_state.get('recall', 0):.2f}%
56
+ - F1 Score: {st.session_state.get('f1', 0):.2f}%
57
+ - KS statistic: {st.session_state.get('ks_value', 0):.2f}
58
+ - Peak efficiency: Captures {st.session_state.get('peak_gains', 0):.2f}% of all helpful reviews by reviewing just {st.session_state.get('percent_data', 0):.4f}% of data
59
+ - Interpretation: This shows the lift over random selection for identifying valuable reviews'''}
60
+
61
+ ## Guidelines
62
+ - Be conversational and concise - only cite specific numbers when directly relevant
63
+ - Help interpret results in business terms (which reviews to prioritize, what topics matter)
64
+ - When discussing topics, reference them by their top words, not just numbers. Suggest potential topic names based on the top words for each topic when asked by the user
65
+ - When discussing model performance, focus on practical implications (e.g., "you can find 80% of helpful reviews by only reading the top 20%")
66
+ - Ask clarifying questions when the user's question is ambiguous
67
+ - Suggest analyses only when naturally relevant to the conversation"""
68
+
69
+ return context
70
+
71
+ def render():
72
+
73
+ st.set_page_config(page_title="Chat", layout="centered")
74
+
75
+ # --- Sidebar ---
76
+ openai_api_key = st.sidebar.text_input("OpenAI API Key", type="password")
77
+
78
+ if not openai_api_key:
79
+ st.info("Enter your OpenAI API key in the sidebar to use the chat.")
80
+ st.stop()
81
+
82
+ client = OpenAI(api_key=openai_api_key)
83
+
84
+ if st.sidebar.button("Reset Chat", type = 'primary'):
85
+ st.session_state.messages = []
86
+
87
+
88
+ # --- State ---
89
+ if "messages" not in st.session_state:
90
+ st.session_state.messages = []
91
+
92
+ # --- Put the chat UI ABOVE the input by creating a container first ---
93
+ chat_box = st.container()
94
+
95
+ with chat_box:
96
+ for msg in st.session_state.messages:
97
+ with st.chat_message(msg["role"]):
98
+ st.markdown(msg["content"])
99
+
100
+ # --- Keep input LAST so it stays at the bottom ---
101
+ user_input = st.chat_input("Say something")
102
+ SYSTEM_PROMPT = system_prompt()
103
+
104
+ if user_input:
105
+ # Add user message to state immediately
106
+ st.session_state.messages.append({"role": "user", "content": user_input})
107
+
108
+ # Render the new user message + stream assistant IN THE chat_box (above input)
109
+ with chat_box:
110
+ with st.chat_message("user"):
111
+ st.markdown(user_input)
112
+
113
+ with st.chat_message("assistant"):
114
+ # System prompt goes first (not shown in UI unless you add it to history)
115
+ model_messages = [{"role": "system", "content": SYSTEM_PROMPT}]
116
+ model_messages += [
117
+ {"role": m["role"], "content": m["content"]}
118
+ for m in st.session_state.messages
119
+ ]
120
+
121
+ stream = client.chat.completions.create(
122
+ model="gpt-5-nano",
123
+ messages=model_messages, # type: ignore
124
+ stream=True,
125
+ ) # type: ignore
126
+
127
+ response = st.write_stream(stream)
128
+
129
+ st.session_state.messages.append({"role": "assistant", "content": response})
130
+
131
+ # Optional but recommended: forces the “final” assistant message to appear
132
+ # in the history immediately (and keeps layout stable).
133
+ st.rerun()
134
+
135
+
136
+
137
+
138
+
tabs/user_page_tabs/user_pred_model.py ADDED
@@ -0,0 +1,322 @@
1
+ from __future__ import annotations
2
+
3
+ import streamlit as st
4
+ import pandas as pd
5
+ import numpy as np
6
+ from utils.load_pred_model import load_model
7
+ from utils.prepare_user_dataframe import prep_text_and_features
8
+ from utils.plot_gains import plot_gains
9
+ from utils.load_user_data import load_uploaded_file
10
+
11
+
12
+ def _initialize_model(model):
13
+ """
14
+ Load the model and set session state variables.
15
+ Only runs when needed.
16
+ """
17
+ st.session_state.svd_comps = model[0][1].n_components
18
+ st.session_state.explained_variance = round(model[0][1].explained_variance_ratio_.sum() * 100, 2)
19
+ st.session_state.title_ngram = model[0][0]['tfidf_title'].ngram_range
20
+ st.session_state.text_ngram = model[0][0]['tfidf_text'].ngram_range
21
+ st.session_state.model_params = model[1].get_params()
22
+ st.session_state.model_loaded = True
23
+
24
+
25
+ # -------------------------------------------------------------------
26
+ # Session state helpers
27
+ # -------------------------------------------------------------------
28
+
29
+ def _init_state() -> None:
30
+ """
31
+ Initialize all session_state keys used in this tab.
32
+ """
33
+ defaults = {
34
+ # Shared across tabs
35
+ "user_base_df": None, # original uploaded df from topic tab
36
+ "user_processed_df": None, # processed df with lemma_* etc.
37
+ "user_raw_df": None, # alias kept for backwards compatibility
38
+ "raw_df": None, # alias kept for backwards compatibility
39
+
40
+ # Prediction-specific
41
+ "prepped_df": None,
42
+ "X": None,
43
+ "true_y": None,
44
+ "probs": None,
45
+ "prep_done": False,
46
+ "model_run": False,
47
+ "active_file_name": None,
48
+ "data_source": None, # "topic_tab" or "upload"
49
+ }
50
+ for key, value in defaults.items():
51
+ if key not in st.session_state:
52
+ st.session_state[key] = value
53
+
54
+
55
+ def _validate_schema(df: pd.DataFrame) -> bool:
56
+ """
57
+ Check that the dataframe has the columns required for the prediction model.
58
+ Allows extra columns; only errors on missing required columns.
59
+ """
60
+ required = {"helpful_vote", "review_title", "text", "images"}
61
+ df_cols = set(df.columns)
62
+ missing = required - df_cols
63
+
64
+ if missing:
65
+ st.error(
66
+ "The uploaded dataset is missing required columns for prediction: "
67
+ + ", ".join(sorted(missing))
68
+ )
69
+ with st.expander("View dataframe columns"):
70
+ st.write(sorted(df.columns.tolist()))
71
+ return False
72
+
73
+ return True
74
+
75
+
76
+ def _set_active_dataframe(df: pd.DataFrame, source: str, file_name: str | None = None) -> None:
77
+ """
78
+ Store the active dataframe in session_state and reset dependent state.
79
+ """
80
+ # For prediction we treat this df as the active working df
81
+ st.session_state.user_raw_df = df
82
+ st.session_state.raw_df = df # legacy alias
83
+ st.session_state.data_source = source
84
+ st.session_state.active_file_name = file_name
85
+
86
+ # Reset downstream state
87
+ st.session_state.prepped_df = None
88
+ st.session_state.X = None
89
+ st.session_state.true_y = None
90
+ st.session_state.probs = None
91
+ st.session_state.prep_done = False
92
+ st.session_state.model_run = False
93
+
94
+
95
+ # -------------------------------------------------------------------
96
+ # Main render function
97
+ # -------------------------------------------------------------------
98
+
99
+ def render() -> None:
100
+ _init_state()
101
+ model = load_model()
102
+ _initialize_model(model)
103
+
104
+ st.header("User Prediction Model")
105
+ st.markdown(
106
+ "Use this tab to run the **helpful-vote prediction model** on your dataset. "
107
+ "You can reuse the dataset from the topic modeling tab or upload a new file."
108
+ )
109
+
110
+ # Prefer the processed df from the topic tab if it exists
111
+ if "user_processed_df" in st.session_state:
112
+ topic_df = st.session_state["user_processed_df"]
113
+ elif "user_raw_df" in st.session_state:
114
+ topic_df = st.session_state["user_raw_df"]
115
+ else:
116
+ topic_df = None
117
+
118
+ has_topic_df = topic_df is not None
119
+
120
+ # -----------------------------
121
+ # Choose data source
122
+ # -----------------------------
123
+ if has_topic_df:
124
+ source = st.radio(
125
+ "Choose data source:",
126
+ ["Use data from Topic Modeling tab", "Upload new dataset"],
127
+ horizontal=True,
128
+ )
129
+ else:
130
+ source = "Upload new dataset"
131
+ st.info(
132
+ "No dataset found from the Topic Modeling tab. "
133
+ "Please upload a dataset to continue."
134
+ )
135
+
136
+ active_df: pd.DataFrame | None = None
137
+
138
+ # -----------------------------
139
+ # Option 1: reuse data from topic tab
140
+ # -----------------------------
141
+ if source == "Use data from Topic Modeling tab":
142
+ if not has_topic_df:
143
+ st.warning("No dataset available from the Topic Modeling tab.")
144
+ st.stop()
145
+
146
+ df = topic_df
147
+ if df is not None and _validate_schema(df):
148
+ # Only reset active dataframe if we weren't already using topic_tab
149
+ if st.session_state.data_source != "topic_tab":
150
+ _set_active_dataframe(df, source="topic_tab", file_name="from_topic_tab")
151
+ # Use the processed/topic df directly
152
+ active_df = df
153
+
154
+ # -----------------------------
155
+ # Option 2: upload one or more files
156
+ # -----------------------------
157
+ else:
158
+ uploaded_files = st.file_uploader(
159
+ "Upload one or more data files (CSV or Parquet)",
160
+ type=["csv", "parquet"],
161
+ key="predictive_data",
162
+ accept_multiple_files=True,
163
+ )
164
+
165
+ chosen_file = None
166
+ if uploaded_files:
167
+ if len(uploaded_files) == 1:
168
+ chosen_file = uploaded_files[0]
169
+ else:
170
+ file_names = [f.name for f in uploaded_files]
171
+ chosen_name = st.selectbox("Select which file to use", file_names)
172
+ chosen_file = next(f for f in uploaded_files if f.name == chosen_name)
173
+
174
+ if chosen_file is not None:
175
+ needs_new = (
176
+ st.session_state.data_source != "upload"
177
+ or st.session_state.active_file_name != chosen_file.name
178
+ )
179
+ if needs_new:
180
+ df = load_uploaded_file(chosen_file)
181
+ if df is not None and _validate_schema(df):
182
+ _set_active_dataframe(
183
+ df, source="upload", file_name=chosen_file.name
184
+ )
185
+ # Use whatever is currently active (may be newly set above)
186
+ active_df = st.session_state.user_raw_df
187
+ elif (
188
+ st.session_state.data_source == "upload"
189
+ and st.session_state.user_raw_df is not None
190
+ ):
191
+ # User uploaded a file earlier; reuse it
192
+ active_df = st.session_state.user_raw_df
193
+
194
+ # If we still don't have an active dataframe, bail out
195
+ if active_df is None and st.session_state.user_raw_df is None:
196
+ st.stop()
197
+
198
+ if active_df is None:
199
+ active_df = st.session_state.user_raw_df
200
+
201
+ # -----------------------------
202
+ # Data preview
203
+ # -----------------------------
204
+ st.markdown("### Active dataset")
205
+
206
+ if st.session_state.active_file_name:
207
+ st.caption(
208
+ f"Using data source: **{st.session_state.data_source}** "
209
+ f"({st.session_state.active_file_name})"
210
+ )
211
+ else:
212
+ st.caption(f"Using data source: **{st.session_state.data_source or 'unknown'}**")
213
+
214
+ with st.expander("Preview first 5 rows"):
215
+ st.dataframe(active_df.head(), width='stretch')
216
+
217
+ # -----------------------------
218
+ # Prep text & features
219
+ # -----------------------------
220
+ st.markdown("### Step 1 – Prepare text and features")
221
+
222
+ prep_col1, prep_col2 = st.columns([1, 3])
223
+ with prep_col1:
224
+ prep_clicked = st.button(
225
+ "Prep text & features",
226
+ type="primary",
227
+ help="Strip HTML, lemmatize text, and create engineered features.",
228
+ )
229
+
230
+ if prep_clicked:
231
+ try:
232
+ # 👇 Pass the *currently active* dataframe. If it has lemma_* columns
233
+ # from the topic tab, prep_text_and_features will reuse them and skip
234
+ # re-lemmatization.
235
+ prep_text_and_features(model, df=active_df)
236
+ except Exception as e:
237
+ st.error(f"Error during preprocessing: {e}")
238
+
239
+ if st.session_state.prep_done:
240
+ with prep_col2:
241
+ st.info(
242
+ "Preprocessing complete. You can now run the model and explore the results."
243
+ )
244
+
245
+ with st.expander("View prepped dataframe (first 5 rows)"):
246
+ st.dataframe(
247
+ st.session_state.prepped_df.head(), width="stretch"
248
+ )
249
+
250
+ # -----------------------------
251
+ # Run model
252
+ # -----------------------------
253
+ st.markdown("### Step 2 – Run prediction model")
254
+
255
+ if not st.session_state.prep_done:
256
+ st.warning("Please run *Prep text & features* before running the model.")
257
+ st.stop()
258
+
259
+ go_ahead = st.button(
260
+ label="Run model?",
261
+ type="primary",
262
+ icon="🔥",
263
+ help="Generate predicted helpful-vote probabilities.",
264
+ )
265
+
266
+ if go_ahead:
267
+ X = st.session_state.X
268
+ st.session_state.probs = model.predict_proba(X)
269
+ st.session_state.model_run = True
270
+ st.success("🔥 Model predictions have been generated.")
271
+
272
+ # -----------------------------
273
+ # Results & gains curve
274
+ # -----------------------------
275
+ st.markdown("### Step 3 – Explore results")
276
+
277
+ if not st.session_state.model_run:
278
+ st.info("Run the model to unlock predictions and the gains curve.")
279
+ return
280
+
281
+ true_y = st.session_state.true_y
282
+ probs = st.session_state.probs
283
+
284
+ show_table = st.checkbox("Show prediction table", value=True)
285
+ show_gains = st.checkbox("Show gains curve", value=True)
286
+
287
+ if show_table:
288
+ compare = pd.DataFrame(
289
+ {
290
+ "True Values": true_y,
291
+ "P(0 – No helpful votes)": probs[:, 0],
292
+ "P(1+ helpful votes)": probs[:, 1],
293
+ }
294
+ )
295
+ st.markdown("#### Prediction probabilities")
296
+ st.dataframe(compare.head(200), width="stretch")
297
+
298
+ if show_gains:
299
+ st.markdown("#### Gains curve")
300
+ fig, data, total = plot_gains(true_y, probs[:, 1])
301
+ st.plotly_chart(fig, width="stretch")
302
+
303
+ if st.session_state.model_run:
304
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
305
+
306
+ y_true = st.session_state.true_y
307
+ y_pred = model.predict(st.session_state.X)
308
+ st.session_state.accuracy = round(accuracy_score(y_true, y_pred) * 100, 4)
309
+ st.session_state.precision = np.round(precision_score(y_true, y_pred) * 100, 4)
310
+ st.session_state.recall = np.round(recall_score(y_true, y_pred) * 100, 4)
311
+ st.session_state.f1 = np.round(f1_score(y_true, y_pred) * 100, 4)
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+
322
+
tabs/user_page_tabs/user_topic_model.py ADDED
@@ -0,0 +1,313 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from typing import List, Optional, Tuple
4
+
5
+ from sklearn.compose import ColumnTransformer
6
+ from sklearn.decomposition import NMF
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.pipeline import make_pipeline, Pipeline
9
+
10
+ from utils.build_plotly import _build_topic_figure
11
+ from utils.load_user_data import load_uploaded_file
12
+ from utils.prepare_user_dataframe import prep_text_column
13
+
14
+ from nltk.corpus import stopwords # type: ignore
15
+ import plotly.graph_objects as go # type: ignore
16
+ import pandas.api.types as ptypes
17
+
18
+ # Build stopword list (don’t mutate across calls)
19
+ BASE_STOPWORDS = set(stopwords.words("english"))
20
+
21
+ CUSTOM_KEEP = {
22
+ 'not','no','but','ain','don',"don't",'aren',"aren't",'couldn',"couldn't",
23
+ 'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',
24
+ "haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',
25
+ "needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',
26
+ "weren't",'won',"won't",'wouldn',"wouldn't",'very','too'
27
+ }
28
+
29
+ DEFAULT_STOPWORDS = sorted(BASE_STOPWORDS - CUSTOM_KEEP)
30
+
31
+
32
+ @st.cache_data(show_spinner="One moment please!", show_time=True)
33
+ def make_topics(
34
+ df: pd.DataFrame,
35
+ user_title: str,
36
+ topic_columns: List[str],
37
+ n1: int,
38
+ n2: int,
39
+ n_components: int,
40
+ rating: Optional[List[int]] = None,
41
+ helpful_vote: Optional[int] = None,
42
+ new_words: Optional[List[str]] = None,
43
+ n_top_words: int = 5,
44
+ ) -> Tuple[ColumnTransformer | Pipeline, go.Figure]:
45
+ """
46
+ Fit TF-IDF + NMF topic model and return (pipeline, Plotly figure).
47
+ """
48
+
49
+ # Start from the input df (this will usually be st.session_state.user_processed_df)
50
+ base_df = df
51
+
52
+ # Validate selected columns exist
53
+ selected_cols = [col for col in topic_columns if col in base_df.columns]
54
+
55
+ if not selected_cols:
56
+ st.error("No valid columns selected for topic modeling.")
57
+ raise ValueError("No valid columns selected for topic modeling.")
58
+
59
+ # Check for text vs non-text columns
60
+ text_cols: list[str] = []
61
+ non_text_cols: list[str] = []
62
+
63
+ for col in selected_cols:
64
+ col_series = base_df[col]
65
+ if ptypes.is_string_dtype(col_series) or ptypes.is_object_dtype(col_series):
66
+ text_cols.append(col)
67
+ else:
68
+ non_text_cols.append(col)
69
+
70
+ if non_text_cols:
71
+ st.error(
72
+ "The following columns are not text columns and will be dropped "
73
+ f"from topic modeling: {', '.join(non_text_cols)}"
74
+ )
75
+
76
+ if not text_cols:
77
+ st.error("None of the selected columns are text columns. Please select text columns.")
78
+ raise ValueError("No text columns available for topic modeling.")
79
+
80
+ # Build stopword list
81
+ stop_list = DEFAULT_STOPWORDS.copy()
82
+ if new_words:
83
+ stop_list.extend(new_words)
84
+
85
+ # ------------------------------------------------------------------
86
+ # Ensure lemma columns exist on the *processed* dataframe
87
+ # ------------------------------------------------------------------
88
+ lemma_cols: list[str] = []
89
+ transformers = []
90
+
91
+ for col in text_cols:
92
+ if col == "review_title":
93
+ lemma_col = "lemma_title"
94
+ elif col == "text":
95
+ lemma_col = "lemma_text"
96
+ else:
97
+ lemma_col = f"lemma_{col}"
98
+
99
+ base_df = prep_text_column(
100
+ df=base_df,
101
+ text_col=col,
102
+ lemma_col=lemma_col,
103
+ overwrite=False,
104
+ )
105
+ lemma_cols.append(lemma_col)
106
+
107
+ transformers.append(
108
+ (
109
+ f"tfidf_{col}",
110
+ TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2)),
111
+ lemma_col,
112
+ )
113
+ )
114
+
115
+ # 🔁 Persist updated processed df back into session_state so the
116
+ # prediction tab can reuse lemma_* columns without re-lemmatizing.
117
+ st.session_state.user_processed_df = base_df
118
+ # Backwards-compat aliases used elsewhere in the app
119
+ st.session_state.raw_df = base_df
120
+ st.session_state.user_raw_df = base_df
121
+
122
+ # ------------------------------------------------------------------
123
+ # Working copy for filtering + modeling
124
+ # ------------------------------------------------------------------
125
+ work_df = base_df.copy()
126
+
127
+ if rating is not None and "rating" in work_df.columns:
128
+ work_df = work_df[work_df["rating"].isin(rating)]
129
+ if helpful_vote is not None and "helpful_vote" in work_df.columns:
130
+ work_df = work_df[work_df["helpful_vote"] > helpful_vote]
131
+
132
+ preprocessor = ColumnTransformer(transformers)
133
+
134
+ nmf = NMF(
135
+ n_components=n_components,
136
+ init="nndsvda",
137
+ solver="mu",
138
+ beta_loss=1,
139
+ random_state=10,
140
+ )
141
+
142
+ topic_pipeline = make_pipeline(preprocessor, nmf)
143
+
144
+ topic_pipeline.fit(work_df[lemma_cols])
145
+
146
+ feature_names = topic_pipeline[0].get_feature_names_out()
147
+ nmf_model: NMF = topic_pipeline[1]
148
+
149
+ fig = _build_topic_figure(
150
+ model=nmf_model,
151
+ feature_names=feature_names,
152
+ n_top_words=n_top_words,
153
+ title=user_title,
154
+ n_components=n_components,
155
+ bar_color="#184A90",
156
+ )
157
+
158
+ return topic_pipeline, fig
159
+
160
+
161
+ def render() -> None:
162
+ st.header("Upload Your Data and Run the Topic Model")
163
+
164
+ # --- STEP 1: Upload & basic review ---
165
+ st.subheader("Step 1: Upload and Review Your Data")
166
+
167
+ uploaded_file = st.file_uploader(
168
+ label="Upload your dataframe (CSV or Parquet)",
169
+ type=["csv", "parquet"],
170
+ key="topic_data",
171
+ label_visibility="collapsed",
172
+ )
173
+
174
+ if uploaded_file is not None:
175
+ file_name = uploaded_file.name
176
+ prev_name = st.session_state.get("topic_file_name")
177
+
178
+ # Only reload if the user picked a *different* file
179
+ if prev_name != file_name:
180
+ df = load_uploaded_file(uploaded_file)
181
+ st.session_state.topic_file_name = file_name
182
+
183
+ # Original upload (never mutated)
184
+ st.session_state.user_base_df = df
185
+
186
+ # Working / processed copy (lemma_* columns etc. get added here)
187
+ processed = df.copy() # type: ignore
188
+ st.session_state.user_processed_df = processed
189
+
190
+ # Backwards-compat aliases used by other parts of the app
191
+ st.session_state.raw_df = processed
192
+ st.session_state.user_raw_df = processed
193
+
194
+ # Prefer the processed dataframe if available
195
+ if "user_processed_df" in st.session_state:
196
+ raw_df = st.session_state["user_processed_df"]
197
+ elif "raw_df" in st.session_state:
198
+ raw_df = st.session_state["raw_df"]
199
+ else:
200
+ raw_df = None
201
+
202
+ if raw_df is None:
203
+ st.info("👆 Upload a dataframe to begin.")
204
+ return
205
+
206
+ st.dataframe(raw_df)
207
+
208
+ user_title = st.text_input(
209
+ "Title for topic plot",
210
+ value="Topics",
211
+ )
212
+
213
+ topic_columns = st.multiselect(
214
+ "Select one or more text columns to topic model",
215
+ options=list(raw_df.columns),
216
+ )
217
+
218
+ col1, col2 = st.columns(spec=2, gap="small")
219
+
220
+ with col1:
221
+ n1 = st.selectbox(
222
+ "First tfidf n-gram length",
223
+ options=[1, 2, 3, 4],
224
+ )
225
+ with col2:
226
+ n1_plus = st.selectbox(
227
+ "Second tfidf n-gram length",
228
+ options=[0, 1, 2],
229
+ help=(
230
+ "This number will add to n1 to create a range of n-gram lengths. "
231
+ "It's not recommended to go beyond an n-gram length of 4"
232
+ ),
233
+ )
234
+
235
+ num_topics = st.slider(
236
+ "Select number of topics to model",
237
+ min_value=1,
238
+ max_value=20,
239
+ value=5,
240
+ step=1,
241
+ help="Any more than 10 isn't usually constructive",
242
+ )
243
+
244
+ with st.expander("Optional parameters"):
245
+ st.write(
246
+ "These are optional parameters that can be selected "
247
+ "to configure your topic model plot"
248
+ )
249
+
250
+ tab1, tab2 = st.tabs(["Filters", "Other"])
251
+
252
+ with tab1:
253
+ st.write(
254
+ "If available, can filter your dataset by the following "
255
+ "columns to topic model by a specific rating or number of votes"
256
+ )
257
+ rating = st.multiselect(
258
+ "Rating equal to ...",
259
+ options=[1, 2, 3, 4, 5],
260
+ )
261
+ help_votes = st.number_input(
262
+ "Votes greater than ... ",
263
+ min_value=0,
264
+ max_value=None,
265
+ value="min",
266
+ step=1,
267
+ )
268
+
269
+ with tab2:
270
+ st.write(
271
+ "If you have domain specific words that are common, "
272
+ "removing them can help make specific topics more clear"
273
+ )
274
+ stop_words = st.multiselect(
275
+ "Optional stop words",
276
+ options=[],
277
+ placeholder="Add words you want removed",
278
+ accept_new_options=True,
279
+ )
280
+
281
+ top_words = st.slider(
282
+ "Number of top words you want displayed for each topic",
283
+ min_value=1,
284
+ max_value=10,
285
+ value=5,
286
+ step=1,
287
+ )
288
+
289
+ if st.button("Run Model", type="primary"):
290
+ if not topic_columns:
291
+ st.error("Please select at least one column for topic modeling.")
292
+ return
293
+
294
+ pipeline, fig = make_topics(
295
+ df=raw_df,
296
+ user_title=user_title,
297
+ topic_columns=topic_columns,
298
+ n1=n1,
299
+ n2=n1 + n1_plus,
300
+ n_components=num_topics,
301
+ rating=rating or None,
302
+ helpful_vote=help_votes or None,
303
+ new_words=stop_words,
304
+ n_top_words=top_words,
305
+ )
306
+
307
+ st.plotly_chart(fig)
308
+
309
+ st.session_state.topic_ngram = (n1, n1 + n1_plus)
310
+ st.session_state.columns = list(raw_df.columns)
311
+ st.session_state.topic_columns = topic_columns
312
+ st.session_state.topics_fitted = True
313
+
utils/__init__.py ADDED
File without changes
utils/build_plotly.py ADDED
@@ -0,0 +1,83 @@
1
+ import numpy as np
2
+ import streamlit as st
3
+ import plotly.graph_objects as go # type: ignore
4
+ from plotly.subplots import make_subplots # type: ignore
5
+ from sklearn.decomposition import NMF
6
+
7
+
8
+ # --------- Plot helper (Plotly) ---------
9
+ def _build_topic_figure(
10
+ model: NMF,
11
+ feature_names: np.ndarray,
12
+ n_top_words: int,
13
+ title: str,
14
+ n_components: int,
15
+ bar_color: str
16
+ ) -> go.Figure:
17
+ """Create a Plotly subplot grid of top terms per topic (horizontal bars)."""
18
+ # Layout: up to 2 columns, as many rows as needed
19
+ cols = 2 if n_components > 3 else 1
20
+ rows = int(np.ceil(n_components / cols))
21
+
22
+ fig = make_subplots(
23
+ rows=rows,
24
+ cols=cols,
25
+ subplot_titles=[f"Topic {i+1}" for i in range(n_components)],
26
+ horizontal_spacing=0.25,
27
+ vertical_spacing=0.1
28
+ )
29
+
30
+ top_features_dict = {}
31
+ max_weight = 0
32
+
33
+ for topic_idx, topic in enumerate(model.components_):
34
+ top_features_ind = topic.argsort()[::-1][:n_top_words]
35
+ top_features = feature_names[top_features_ind]
36
+ weights = topic[top_features_ind] / np.sum(topic) * 100
37
+ top_features_dict[topic_idx] = {"features": list(top_features), "weights": list(np.round(weights, 4))}
38
+
39
+ max_weight = max(max_weight, weights.max())
40
+
41
+ # subplot position
42
+ r = topic_idx // cols + 1
43
+ c = topic_idx % cols + 1
44
+
45
+ fig.add_trace(
46
+ go.Bar(
47
+ x=weights,
48
+ y=top_features,
49
+ orientation="h",
50
+ marker=dict(color=bar_color, line=dict(color="white", width=1)),
51
+ text=[f"{w:.2f}" for w in weights],
52
+ textposition="outside",
53
+ hovertemplate="<b>%{y}</b><br>weight=%{x:.2f}%<extra></extra>",
54
+ showlegend=False
55
+ ),
56
+ row=r, col=c
57
+ )
58
+
59
+ # nicer y ordering (largest at top)
60
+ fig.update_yaxes(autorange="reversed", row=r, col=c)
61
+
62
+ # Set x-axis range with padding for all subplots
63
+ for r_idx in range(1, rows + 1):
64
+ for c_idx in range(1, cols + 1):
65
+ fig.update_xaxes(
66
+ range=[0, max_weight * 1.25], # Add 25% padding for text labels
67
+ row=r_idx,
68
+ col=c_idx
69
+ )
70
+
71
+ # Axes labels for the bottom row
72
+ for c in range(1, cols + 1):
73
+ fig.update_xaxes(title_text="Relative Weight (%)", row=rows, col=c)
74
+
75
+ fig.update_layout(
76
+ title=f"<b>{title}</b>",
77
+ height=max(350, 330 * rows),
78
+ margin=dict(l=50, r=20, t=60, b=60)
79
+ )
80
+
81
+ st.session_state.top_topics = top_features_dict
82
+
83
+ return fig
utils/icons.py ADDED
@@ -0,0 +1,143 @@
1
+ ICONS = {
2
+ "git-merge": """
3
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
4
+ <circle cx="18" cy="18" r="3"></circle>
5
+ <circle cx="6" cy="6" r="3"></circle>
6
+ <path d="M6 9v6c0 2.2 1.8 4 4 4h4"></path>
7
+ <path d="m18 9-6-6"></path>
8
+ </svg>
9
+ """,
10
+ "scatter-chart": """
11
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
12
+ <path d="M21 21H3V3"></path>
13
+ <circle cx="10" cy="10" r="1"></circle>
14
+ <circle cx="17" cy="17" r="1"></circle>
15
+ <circle cx="7" cy="17" r="1"></circle>
16
+ <circle cx="17" cy="7" r="1"></circle>
17
+ </svg>
18
+ """,
19
+ "link": """
20
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
21
+ <path d="M10 13a5 5 0 0 0 7.54.54l3-3a5 5 0 0 0-7.07-7.07l-1.72 1.71"></path>
22
+ <path d="M14 11a5 5 0 0 0-7.54-.54l-3 3a5 5 0 0 0 7.07 7.07l1.71-1.71"></path>
23
+ </svg>
24
+ """,
25
+ "layout-dashboard": """
26
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
27
+ <rect width="7" height="9" x="3" y="3" rx="1"></rect>
28
+ <rect width="7" height="5" x="14" y="3" rx="1"></rect>
29
+ <rect width="7" height="9" x="14" y="12" rx="1"></rect>
30
+ <rect width="7" height="5" x="3" y="16" rx="1"></rect>
31
+ </svg>
32
+ """,
33
+ "table": """
34
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
35
+ <path d="M12 3v18"></path>
36
+ <rect width="18" height="18" x="3" y="3" rx="2"></rect>
37
+ <path d="M3 9h18"></path>
38
+ <path d="M3 15h18"></path>
39
+ </svg>
40
+ """,
41
+ "info": """
42
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
43
+ <circle cx="12" cy="12" r="10"></circle>
44
+ <path d="M12 16v-4"></path>
45
+ <path d="M12 8h.01"></path>
46
+ </svg>
47
+ """,
48
+ "calculator": """
49
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
50
+ <rect width="16" height="20" x="4" y="2" rx="2"></rect>
51
+ <line x1="8" x2="16" y1="6" y2="6"></line>
52
+ <line x1="12" x2="12" y1="10" y2="18"></line>
53
+ <line x1="8" x2="16" y1="14" y2="14"></line>
54
+ </svg>
55
+ """,
56
+ "target": """
57
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
58
+ <circle cx="12" cy="12" r="10"></circle>
59
+ <circle cx="12" cy="12" r="6"></circle>
60
+ <circle cx="12" cy="12" r="2"></circle>
61
+ </svg>
62
+ """,
63
+ "tag": """
64
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
65
+ <path d="M12 2H2v10l9.29 9.29c.94.94 2.48.94 3.42 0l6.58-6.58c.94-.94.94-2.48 0-3.42L12 2Z"></path>
66
+ <path d="M7 7h.01"></path>
67
+ </svg>
68
+ """,
69
+ "tags": """
70
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
71
+ <path d="M9 5H2v7l6.29 6.29c.94.94 2.48.94 3.42 0l3.58-3.58"></path>
72
+ <path d="M13.29 17.71L21 10V3h-7l-6.29 6.29c-.94.94-.94 2.48 0 3.42l3.58 3.58"></path>
73
+ <path d="M7 7h.01"></path>
74
+ <path d="M15 15h.01"></path>
75
+ </svg>
76
+ """,
77
+ "trending-up": """
78
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
79
+ <polyline points="22 7 13.5 15.5 8.5 10.5 2 17"></polyline>
80
+ <polyline points="16 7 22 7 22 13"></polyline>
81
+ </svg>
82
+ """,
83
+ "search": """
84
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
85
+ <circle cx="11" cy="11" r="8"></circle>
86
+ <path d="m21 21-4.3-4.3"></path>
87
+ </svg>
88
+ """,
89
+ "hash": """
90
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
91
+ <line x1="4" x2="20" y1="9" y2="9"></line>
92
+ <line x1="4" x2="20" y1="15" y2="15"></line>
93
+ <line x1="10" x2="8" y1="3" y2="21"></line>
94
+ <line x1="16" x2="14" y1="3" y2="21"></line>
95
+ </svg>
96
+ """,
97
+ "alert-triangle": """
98
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
99
+ <path d="m21.73 18-8-14a2 2 0 0 0-3.46 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
100
+ <path d="M12 9v4"></path>
101
+ <path d="M12 17h.01"></path>
102
+ </svg>
103
+ """,
104
+ "bar-chart-2": """
105
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
106
+ <line x1="18" x2="18" y1="20" y2="10"></line>
107
+ <line x1="12" x2="12" y1="20" y2="4"></line>
108
+ <line x1="6" x2="6" y1="20" y2="14"></line>
109
+ </svg>
110
+ """,
111
+ "bar-chart": """
112
+ <svg xmlns="http://www.w3.org/2000/svg" width="{size}" height="{size}" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
113
+ <line x1="12" x2="12" y1="20" y2="10"></line>
114
+ <line x1="18" x2="18" y1="20" y2="4"></line>
115
+ <line x1="6" x2="6" y1="20" y2="16"></line>
116
+ </svg>
117
+ """
118
+ }
119
+
120
+ def lucide_icon(name: str, size: int = 18) -> str:
121
+ """
122
+ Return an inline SVG or text placeholder for a Lucide icon.
123
+
124
+ Looks up the given icon name in the ICONS dictionary and returns
125
+ the corresponding SVG string, formatted with the requested size.
126
+ If the icon name is not found, falls back to a simple HTML <span>
127
+ displaying the icon name text at the same approximate size.
128
+
129
+ Args:
130
+ name (str): The Lucide icon name (e.g., "git-merge", "bar-chart-2").
131
+ size (int): Desired icon size in pixels. Applied to both width and height.
132
+
133
+ Returns:
134
+ str: HTML string containing either the SVG markup or a styled text placeholder.
135
+ """
136
+ svg = ICONS.get(name)
137
+ if svg:
138
+ return svg.format(size=size)
139
+ else:
140
+ safe = (name or "").replace("<", "&lt;").replace(">", "&gt;")
141
+ return f'<span style="font-size:{size}px; line-height:1; opacity:0.75;">{safe}</span>'
142
+
143
+
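A minimal usage sketch for lucide_icon from a Streamlit page in this app; the icon name and size below are illustrative, and unsafe_allow_html is needed because the helper returns raw SVG/HTML markup.

    import streamlit as st
    from utils.icons import lucide_icon

    # Render an inline SVG icon next to a label
    st.markdown(f"{lucide_icon('bar-chart-2', size=20)} Review dashboard", unsafe_allow_html=True)

    # Unknown names fall back to a plain-text placeholder instead of raising
    st.markdown(lucide_icon("not-a-real-icon"), unsafe_allow_html=True)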
utils/load_data.py ADDED
@@ -0,0 +1,24 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ from pathlib import Path
4
+ import os
5
+
6
+ @st.cache_data(show_spinner="Loading data...⏳")
7
+ def load_dataset(path: str | Path, category: str | None = None):
+ """Load a review parquet file: by category name within a data directory, or directly from a file path."""
8
+
9
+ DATA_OPTIONS = {
10
+ 'Beauty': 'All_Beauty.parquet',
11
+ 'Appliances': 'Appliances.parquet',
12
+ 'Baby Products': 'Baby_Products.parquet',
13
+ 'Electronics': 'Electronics.parquet',
14
+ 'Health and Household': 'Health_and_Household.parquet',
15
+ 'Movies and TV': 'Movies_and_TV.parquet'
16
+ }
17
+
18
+ if category:
19
+ data_path = os.path.join(path, DATA_OPTIONS[category])
20
+ df = pd.read_parquet(data_path)
21
+ else:
22
+ df = pd.read_parquet(path)
23
+
24
+ return df
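A hedged usage sketch for load_dataset; the local review_data directory and the category key are assumptions based on the DATA_OPTIONS mapping above, and the call is expected to run inside a Streamlit page so the cache decorator applies.

    from utils.load_data import load_dataset

    # Load one category from a directory of parquet files...
    appliances_df = load_dataset("review_data", category="Appliances")

    # ...or point directly at a single parquet file
    beauty_df = load_dataset("review_data/All_Beauty.parquet")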
utils/load_pred_model.py ADDED
@@ -0,0 +1,43 @@
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ import joblib
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ # def get_root():
8
+ # # app_utils is inside streamlit_app, so parent is streamlit_app
9
+ # return Path(__file__).resolve().parent.parent
10
+
11
+ # # def get_model_path():
12
+ # # return get_root() / "models" / "sgdc_pipeline.joblib"
13
+
14
+ # def get_data_path():
15
+ # return get_root() / "models" / "demo_data.parquet"
16
+
17
+ # @st.cache_resource(show_spinner='Loading model')
18
+ # def load_model():
19
+ # path = get_model_path()
20
+ # if not path.exists():
21
+ # raise FileNotFoundError(f"Model file not found at: {path}")
22
+ # return joblib.load(path)
23
+
24
+ @st.cache_resource(show_spinner='Loading model') # Cache the loaded pipeline so the download and joblib load happen only once per server process
25
+ def load_model():
26
+ # Download the serialized pipeline from the Hugging Face model repo
27
+ model_path = hf_hub_download(
28
+ repo_id="tkbarb10/ads505-prediction-model",
29
+ filename="sgdc_pipeline.joblib"
30
+ )
31
+ # Load the pipeline with joblib (the same library used to save it)
32
+ return joblib.load(model_path)
33
+
34
+
35
+ @st.cache_data(show_spinner='Loading demo data...')
36
+ def load_demo_data():
37
+ # Download the demo parquet file from the Hugging Face dataset repo
38
+ file_path = hf_hub_download(
39
+ repo_id="tkbarb10/ads505-review-data",
40
+ repo_type="dataset",
41
+ filename="demo_data.parquet"
42
+ )
43
+ return pd.read_parquet(file_path)
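A sketch of how these cached loaders might be combined on an app page; feature_names_in_ and predict_proba are assumptions about the saved SGDClassifier pipeline (predict_proba requires a probabilistic loss such as log_loss).

    from utils.load_pred_model import load_model, load_demo_data

    model = load_model()        # cached hub download + joblib.load
    demo_df = load_demo_data()  # cached parquet download

    # Score the demo data using the feature columns the pipeline was fitted on (assumption)
    feature_cols = list(model.feature_names_in_)
    probs = model.predict_proba(demo_df[feature_cols])[:, 1]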
utils/load_user_data.py ADDED
@@ -0,0 +1,13 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+
4
+ def load_uploaded_file(uploaded_file) -> pd.DataFrame | None:
+ """Read an uploaded CSV or Parquet file into a DataFrame; otherwise show an error and return None."""
5
+ filename = uploaded_file.name.lower()
6
+
7
+ if filename.endswith(".csv"):
8
+ return pd.read_csv(uploaded_file)
9
+ elif filename.endswith(".parquet"):
10
+ return pd.read_parquet(uploaded_file)
11
+ else:
12
+ st.error("Unsupported file type. Please upload a CSV or Parquet file.")
13
+ return None
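A minimal sketch of the intended call site, assuming a standard st.file_uploader widget feeds this helper.

    import streamlit as st
    from utils.load_user_data import load_uploaded_file

    uploaded = st.file_uploader("Upload a review file", type=["csv", "parquet"])
    if uploaded is not None:
        raw_df = load_uploaded_file(uploaded)
        if raw_df is not None:
            st.session_state.raw_df = raw_df
            st.dataframe(raw_df.head())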
utils/plot_gains.py ADDED
@@ -0,0 +1,63 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import plotly.graph_objects as go #type: ignore
4
+
5
+ def plot_gains(y_true, y_probs):
+ """Build a cumulative gains curve with its KS statistic; returns (fig, data_percent, cum_percent) and stores summary values in st.session_state."""
6
+ # Build and sort dataframe
7
+ df = pd.DataFrame({
8
+ 'Actual': y_true,
9
+ 'Predicted': y_probs
10
+ }).sort_values(by='Predicted', ascending=False).reset_index(drop=True)
11
+
12
+ # Compute cumulative gain
13
+ df['Cumulative Percent'] = df['Actual'].cumsum() / df['Actual'].sum()
14
+ df['Percent of Data'] = (df.index + 1) / len(df)
15
+
16
+ # Compute the KS statistic (maximum vertical gap between the gains curve and the random baseline)
17
+ df['ks_stat'] = df['Cumulative Percent'] - df['Percent of Data']
18
+ ks_value = df['ks_stat'].max()
19
+ ks_idx = df['ks_stat'].idxmax()
20
+ cum_percent = df['Cumulative Percent'][ks_idx]
21
+ data_percent = df['Percent of Data'][ks_idx]
22
+
23
+ # Plotly figure
24
+ fig = go.Figure()
25
+
26
+ # Model Gains Curve
27
+ fig.add_trace(go.Scatter(
28
+ x=df['Percent of Data'],
29
+ y=df['Cumulative Percent'],
30
+ mode='lines',
31
+ name='Model Gains Curve',
32
+ line=dict(width=3)
33
+ ))
34
+
35
+ # Random baseline
36
+ fig.add_trace(go.Scatter(
37
+ x=[0, 1],
38
+ y=[0, 1],
39
+ mode='lines',
40
+ name='Random Baseline',
41
+ line=dict(width=2, dash='dash', color='gray')
42
+ ))
43
+
44
+ fig.add_annotation(
45
+ x=data_percent,
46
+ y=cum_percent,
47
+ text=f'Best Returns: {data_percent*100:.2f}%'
48
+ )
49
+
50
+ fig.update_layout(
51
+ title="Gains Curve",
52
+ xaxis_title="Percent of Data",
53
+ yaxis_title="Percent of Total Positive Cases Captured",
54
+ template="plotly_white",
55
+ height=450,
56
+ legend=dict(yanchor="bottom", y=0, xanchor="right", x=1)
57
+ )
58
+
59
+ st.session_state.ks_value = ks_value
60
+ st.session_state.peak_gains = round(data_percent * 100, 2)
61
+ st.session_state.percent_data = round(cum_percent * 100, 2)
62
+
63
+ return fig, data_percent, cum_percent
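A hedged example of wiring plot_gains into a page; it assumes earlier steps have stored the binary target in st.session_state.true_y and the positive-class probabilities in st.session_state.probs.

    import streamlit as st
    from utils.plot_gains import plot_gains

    fig, data_percent, cum_percent = plot_gains(
        y_true=st.session_state.true_y,
        y_probs=st.session_state.probs,
    )
    st.plotly_chart(fig, use_container_width=True)
    st.caption(
        f"Targeting the top {data_percent:.0%} of reviews captures "
        f"about {cum_percent:.0%} of the helpful ones."
    )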
utils/prepare_user_dataframe.py ADDED
@@ -0,0 +1,234 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import re
4
+ import nltk # type: ignore
5
+ from nltk.stem import WordNetLemmatizer # type: ignore
6
+ from nltk.corpus import wordnet # type: ignore
7
+ from nltk.tokenize import word_tokenize # type: ignore
8
+ from typing import Optional
9
+
10
+
11
+ # -------------------------------------------------------------------
12
+ # HTML cleaning and lemmatization helpers
13
+ # -------------------------------------------------------------------
14
+
15
+
16
+ def remove_user_html_tags(text: str) -> str:
17
+ """Remove basic HTML entities/tags and lowercase the text.
18
+
19
+ This preserves the original behavior used when training the model.
20
+ """
21
+ if text is None:
22
+ return ""
23
+
24
+ # Replace common HTML entities with their corresponding characters
25
+ text = text.replace('&#34;', '"') # Replace "
26
+ text = text.replace('&quot;', '"') # Also replace the named entity for "
27
+ text = text.replace('&apos;', "'") # Replace '
28
+ text = text.replace('&#39;', "'") # Also replace the numeric entity for '
29
+ text = text.replace('&amp;', '&') # Replace &
30
+ text = text.replace('<br />', ' ') # Replace line breaks with a space
31
+ text = text.replace('<br>', ' ') # Also handle <br>
32
+
33
+ # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
34
+ clean_text = re.sub(r'<[^>]+>', '', text)
35
+
36
+ return clean_text.lower()
37
+
38
+
39
+ def get_wordnet_pos(treebank_tag: str) -> str:
40
+ """Converts treebank POS tags to WordNet POS tags."""
41
+ if treebank_tag.startswith('J'):
42
+ return wordnet.ADJ
43
+ elif treebank_tag.startswith('V'):
44
+ return wordnet.VERB
45
+ elif treebank_tag.startswith('N'):
46
+ return wordnet.NOUN
47
+ elif treebank_tag.startswith('R'):
48
+ return wordnet.ADV
49
+ else:
50
+ # Default to noun if the tag is not recognized
51
+ return wordnet.NOUN
52
+
53
+
54
+ def lemmatize_user_text(text: str) -> str:
55
+ """Tokenizes, POS-tags, and lemmatizes a string of text."""
56
+ if not isinstance(text, str):
57
+ text = "" if text is None else str(text)
58
+
59
+ lemmatizer = WordNetLemmatizer()
60
+
61
+ # 1. Tokenize the text into words
62
+ tokens = word_tokenize(text)
63
+
64
+ # 2. Get the part-of-speech tag for each token
65
+ tagged_tokens = nltk.pos_tag(tokens)
66
+
67
+ # 3. Lemmatize each word with its corresponding POS tag
68
+ lemmatized_output = []
69
+ for word, tag in tagged_tokens:
70
+ pos = get_wordnet_pos(tag)
71
+ lemma = lemmatizer.lemmatize(word, pos=pos)
72
+ lemmatized_output.append(lemma)
73
+
74
+ return " ".join(lemmatized_output)
75
+
76
+
77
+ def prep_text_column(
78
+ df: pd.DataFrame,
79
+ text_col: str,
80
+ lemma_col: str,
81
+ overwrite: bool = False,
82
+ ) -> pd.DataFrame:
83
+ """
84
+ Column-agnostic helper to clean HTML and create a lemma column.
85
+
86
+ - If lemma_col already exists and overwrite=False, we return df unchanged.
87
+ - Otherwise we copy df and do the expensive cleaning + lemmatization.
88
+ """
89
+ # ✅ Fast path: if lemma already exists and we don't want to recompute, just reuse it
90
+ if lemma_col in df.columns and not overwrite:
91
+ return df
92
+ else:
93
+ # Only now do we copy and do heavy work
94
+ df_out = df.copy()
95
+
96
+ if text_col not in df_out.columns:
97
+ raise KeyError(f"Column '{text_col}' not found in dataframe.")
98
+
99
+ df_out[text_col] = (
100
+ df_out[text_col]
101
+ .fillna("")
102
+ .apply(remove_user_html_tags)
103
+ .astype(str)
104
+ .str.strip()
105
+ )
106
+
107
+ df_out[lemma_col] = df_out[text_col].apply(lemmatize_user_text)
108
+
109
+ return df_out
110
+
111
+
112
+
113
+ # -------------------------------------------------------------------
114
+ # Internal text prep for prediction
115
+ # -------------------------------------------------------------------
116
+
117
+
118
+ @st.cache_data(show_spinner='Prepping data!')
119
+ def _prep_user_text(df: pd.DataFrame) -> pd.DataFrame:
120
+ """Prepare core text columns for the prediction model.
121
+
122
+ This function:
123
+ - Ensures HTML cleaning + lemmatization for:
124
+ * 'text' -> 'lemma_text'
125
+ * 'review_title' -> 'lemma_title'
126
+ - Ensures the length features:
127
+ * 'Review Length'
128
+ * 'Title Length'
129
+
130
+ It is safe to call even if some of these columns already exist; in that case,
131
+ lemmatization is skipped and only length features are added if needed.
132
+ """
133
+ work_df = df.copy()
134
+
135
+ # Only lemmatize if the lemma columns are missing
136
+ if 'lemma_text' not in work_df.columns:
137
+ work_df = prep_text_column(work_df, text_col='text', lemma_col='lemma_text')
138
+
139
+ if 'lemma_title' not in work_df.columns:
140
+ work_df = prep_text_column(work_df, text_col='review_title', lemma_col='lemma_title')
141
+
142
+ # Ensure length features (only create if missing)
143
+ if 'Review Length' not in work_df.columns:
144
+ work_df['Review Length'] = work_df['text'].fillna('').apply(len)
145
+
146
+ if 'Title Length' not in work_df.columns:
147
+ work_df['Title Length'] = work_df['review_title'].fillna('').apply(len)
148
+
149
+ return work_df
150
+
151
+
152
+ # -------------------------------------------------------------------
153
+ # Public entry point used by the Streamlit app
154
+ # -------------------------------------------------------------------
155
+
156
+
157
+ def prep_text_and_features(model, df: Optional[pd.DataFrame] = None) -> None:
158
+ """Run text prep and feature assembly, storing results in session_state.
159
+
160
+ Behavior:
161
+ - If `df` is None, uses `st.session_state.raw_df` (current app behavior).
162
+ - Checks that required columns are present for the predictive model.
163
+ - Ensures HTML+lemma for title and text, and creates:
164
+ * 'Review Length'
165
+ * 'Title Length'
166
+ * 'vote' (binary target: 1 if helpful_vote > 0 else 0)
167
+ - Builds the feature matrix X based on `model.feature_names_in_`:
168
+ ['lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length']
169
+ - Stores:
170
+ * prepped_df
171
+ * X
172
+ * true_y
173
+ * prep_done flag
174
+ * resets downstream prediction state
175
+ """
176
+
177
+ if df is None:
178
+ df = st.session_state.get('raw_df')
179
+
180
+ if df is None:
181
+ st.warning("Upload a dataframe first.")
182
+ return
183
+
184
+ # Make sure the core columns are present
185
+ required_cols = {'helpful_vote', 'review_title', 'text', 'images'}
186
+ missing = required_cols - set(df.columns)
187
+
188
+ if missing:
189
+ st.error(
190
+ "The uploaded dataframe is missing required columns: "
191
+ + ", ".join(sorted(missing))
192
+ )
193
+ return
194
+
195
+ # Core text prep (HTML + lemma + length features)
196
+ prepped = _prep_user_text(df)
197
+
198
+ # Create binary target
199
+ prepped["vote"] = prepped["helpful_vote"].apply(lambda x: 1 if x > 0 else 0)
200
+
201
+ # Assemble features expected by the predictive model
202
+ # Your model expects:
203
+ # 'lemma_title', 'lemma_text', 'images', 'Review Length', 'Title Length'
204
+ # We still respect model.feature_names_in_ for robustness.
205
+ feature_cols = list(getattr(model, "feature_names_in_", [])) or [
206
+ "lemma_title",
207
+ "lemma_text",
208
+ "images",
209
+ "Review Length",
210
+ "Title Length",
211
+ ]
212
+
213
+ # Keep only columns that actually exist
214
+ feature_cols = [c for c in feature_cols if c in prepped.columns]
215
+
216
+ if not feature_cols:
217
+ st.error(
218
+ "No valid feature columns found for the model. Expected something like: "
219
+ "lemma_title, lemma_text, images, Review Length, Title Length."
220
+ )
221
+ return
222
+
223
+ X = prepped[feature_cols]
224
+ true_y = prepped["vote"]
225
+
226
+ # Store in session_state for downstream use
227
+ st.session_state.prepped_df = prepped
228
+ st.session_state.X = X
229
+ st.session_state.true_y = true_y
230
+ st.session_state.prep_done = True
231
+
232
+ # Reset downstream state if re-prepping
233
+ st.session_state.probs = None
234
+ st.session_state.model_run = False
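A sketch of the expected end-to-end flow, assuming an uploaded dataframe is already in st.session_state.raw_df and that the NLTK resources used above (punkt, wordnet, averaged_perceptron_tagger) have been downloaded.

    import streamlit as st
    from utils.load_pred_model import load_model
    from utils.prepare_user_dataframe import prep_text_and_features

    model = load_model()
    prep_text_and_features(model)  # reads st.session_state.raw_df

    if st.session_state.get("prep_done"):
        X = st.session_state.X
        true_y = st.session_state.true_y
        st.write(f"Prepared {len(X)} rows with features: {list(X.columns)}")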
utils/remove_html.py ADDED
@@ -0,0 +1,16 @@
1
+ import re
2
+
3
+ def remove_html_tags(text):
+ """Strip common HTML entities and tags from a review string and lowercase it."""
4
+ # Replace common HTML entities with their corresponding characters
5
+ text = text.replace('&#34;', '"') # Replace "
6
+ text = text.replace('&quot;', '"') # Also replace the named entity for "
7
+ text = text.replace('&apos;', "'") # Replace '
8
+ text = text.replace('&#39;', "'") # Also replace the numeric entity for '
9
+ text = text.replace('&amp;', '&') # Replace &
10
+ text = text.replace('<br />', ' ') # Replace line breaks with a space
11
+ text = text.replace('<br>', ' ') # Also handle <br>
12
+
13
+ # Use regex to remove any remaining HTML tags (e.g., <p>, <div>, <span>)
14
+ clean_text = re.sub(r'<[^>]+>', '', text)
15
+
16
+ return clean_text.lower()
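A quick, self-contained example of what remove_html_tags does to a raw review string.

    from utils.remove_html import remove_html_tags

    raw = "Great blender!<br />It&#39;s <b>MUCH</b> quieter than my old one."
    print(remove_html_tags(raw))
    # -> great blender! it's much quieter than my old one.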
utils/topically.py ADDED
@@ -0,0 +1,163 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import List, Optional, Tuple
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.compose import ColumnTransformer
9
+ from sklearn.decomposition import NMF
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.pipeline import make_pipeline, Pipeline
12
+ from utils.build_plotly import _build_topic_figure
13
+
14
+ import plotly.graph_objects as go # type: ignore
15
+
16
+ import streamlit as st
17
+ from nltk.corpus import stopwords # type: ignore
18
+
19
+ from utils.remove_html import remove_html_tags
20
+
21
+ # --------- Defaults / Paths ---------
22
+ # ROOT = Path(__file__).resolve().parents[1]
23
+ # DEFAULT_DATA_DIR = ROOT / "review_data"
24
+
25
+ from huggingface_hub import snapshot_download
26
+
27
+ @st.cache_resource
28
+ def get_data_directory():
29
+ # Download a snapshot of the Hugging Face dataset repo (it contains the review_data folder)
30
+ data_path = snapshot_download(
31
+ repo_id="tkbarb10/ads505-review-data",
32
+ repo_type="dataset"
33
+ )
34
+ return Path(data_path) / "review_data"
35
+
36
+ DEFAULT_DATA_DIR = get_data_directory()
37
+
38
+ COLOR_WHEEL = {
39
+ "All_Beauty": "#d946ef", # magenta-ish
40
+ "Appliances": "#800000", # maroon
41
+ "Baby_Products": "#87ceeb", # skyblue
42
+ "Electronics": "#ffd700", # gold
43
+ "Health_and_Household": "#3cb371", # mediumseagreen
44
+ "Movies_and_TV": "#663399" # rebeccapurple
45
+ }
46
+
47
+ # Build stopword list (don’t mutate across calls)
48
+ BASE_STOPWORDS = set(stopwords.words("english"))
49
+ CUSTOM_KEEP = {
50
+ 'not','no','but','ain','don',"don't",'aren',"aren't",'couldn',"couldn't",
51
+ 'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',
52
+ "haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',
53
+ "needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',
54
+ "weren't",'won',"won't",'wouldn',"wouldn't",'very','too'
55
+ }
56
+ DEFAULT_STOPWORDS = sorted(list(BASE_STOPWORDS - CUSTOM_KEEP))
57
+
58
+
59
+ # --------- Data loading / modeling ---------
60
+ def _load_category_df(
61
+ data_dir: Path | str,
62
+ category: str,
63
+ lemmatize: bool,
64
+ nrows: int
65
+ ) -> pd.DataFrame:
66
+ """Load parquet for category; choose lemma or raw; basic cleaning."""
67
+ data_dir = Path(data_dir)
68
+ path = data_dir / f"{category}.parquet"
69
+ lemma_path = data_dir / f"lemma_data/{category}.parquet"
70
+
71
+ if lemmatize:
72
+ df = pd.read_parquet(lemma_path)
73
+ else:
74
+ df = pd.read_parquet(path)
75
+ if "text" in df.columns:
76
+ df["text"] = df["text"].astype(str).str.strip().apply(remove_html_tags)
77
+
78
+ return df.iloc[:nrows, :].copy()
79
+
80
+
81
+ #@st.cache_data(show_spinner="One moment please!", show_time=True)
82
+ def make_topics(
83
+ category: str,
84
+ topic_columns: str,
85
+ lemmatize: bool,
86
+ n1: int,
87
+ n2: int,
88
+ n_components: int,
89
+ rating: Optional[List[int]] = None,
90
+ helpful_vote: Optional[int] = None,
91
+ new_words: Optional[List[str]] = None,
92
+ n_top_words: int = 5,
93
+ data_dir: Optional[str | Path] = None,
94
+ nrows: int = 10_000
95
+ ) -> Tuple[ColumnTransformer | Pipeline, go.Figure]:
96
+ """
97
+ Fit TF-IDF + NMF topic model and return (pipeline, Plotly figure).
98
+
99
+ Returns:
100
+ (topic_pipeline, fig)
101
+ """
102
+ data_dir = data_dir or DEFAULT_DATA_DIR
103
+ df = _load_category_df(data_dir, category, lemmatize, nrows=nrows)
104
+
105
+ # Optional filters
106
+ if rating is not None and "rating" in df.columns:
107
+ df = df[df["rating"].isin(rating)]
108
+ if helpful_vote is not None and "helpful_vote" in df.columns:
109
+ df = df[df["helpful_vote"] > helpful_vote]
110
+
111
+ # Columns to model
112
+ topic_columns = (topic_columns or "").strip().lower()
113
+ # Make a fresh stopword list each call to avoid global mutation
114
+ stop_list = list(DEFAULT_STOPWORDS)
115
+ if new_words:
116
+ stop_list.extend(new_words)
117
+
118
+ tfidf_text = TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2))
119
+ tfidf_title = TfidfVectorizer(stop_words=stop_list, ngram_range=(n1, n2))
120
+
121
+ if topic_columns == "both":
122
+ preprocessor = ColumnTransformer([
123
+ ("title", tfidf_title, "title"),
124
+ ("text", tfidf_text, "text")
125
+ ])
126
+ elif topic_columns == "text":
127
+ preprocessor = ColumnTransformer([("text", tfidf_text, "text")])
128
+ else:
129
+ # default to title if not 'both' or 'text'
130
+ preprocessor = ColumnTransformer([("title", tfidf_title, "title")])
131
+
132
+ nmf = NMF(
133
+ n_components=n_components,
134
+ init="nndsvda",
135
+ solver="mu",
136
+ beta_loss=1,
137
+ random_state=10
138
+ )
139
+
140
+ topic_pipeline = make_pipeline(preprocessor, nmf)
141
+ # Fit on only the columns the preprocessor expects
142
+ fit_cols = [c for c in ["title", "text"] if c in df.columns]
143
+ topic_pipeline.fit(df[fit_cols])
144
+
145
+ feature_names = topic_pipeline[0].get_feature_names_out()
146
+ nmf_model: NMF = topic_pipeline[1]
147
+
148
+ # Choose color from map (fallback if category label differs)
149
+ bar_color = COLOR_WHEEL.get(category, "#184A90")
150
+
151
+ fig = _build_topic_figure(
152
+ model=nmf_model,
153
+ feature_names=feature_names,
154
+ n_top_words=n_top_words,
155
+ title=category,
156
+ n_components=n_components,
157
+ bar_color=bar_color
158
+ )
159
+
160
+ return topic_pipeline, fig
161
+
162
+
163
+
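A hedged call sketch for make_topics; the argument values are illustrative, and running it assumes the category parquet files exist in the downloaded dataset snapshot.

    import streamlit as st
    from utils.topically import make_topics

    pipeline, fig = make_topics(
        category="All_Beauty",
        topic_columns="text",   # "title", "text", or "both"
        lemmatize=False,
        n1=1, n2=2,             # unigram and bigram range for TF-IDF
        n_components=5,         # number of NMF topics
        rating=[1, 2],          # optional filter: low-star reviews only
        n_top_words=5,
    )
    st.plotly_chart(fig, use_container_width=True)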
uv.lock ADDED
The diff for this file is too large to render. See raw diff