RuntimePirate committed
Commit 3b270cd · verified · Parent: 09aaf1a

Create app.py

Files changed (1): app.py (+233 -0)
app.py ADDED
@@ -0,0 +1,233 @@
+ import os
+
+ # Set before any tokenizer is used so the Hugging Face tokenizers library
+ # does not emit its fork-parallelism warning.
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ import io
+ from typing import List, Tuple
+
+ import streamlit as st
+ from transformers import pipeline
+ import PyPDF2
+ import nltk
+
+ @st.cache_resource
+ def download_nltk():
+     # "punkt" provides sentence tokenization; newer NLTK releases also
+     # look for "punkt_tab".
+     nltk.download("punkt", quiet=True)
+     nltk.download("punkt_tab", quiet=True)
+
+ download_nltk()
+
+ with st.sidebar:
+     st.image(
+         "https://raw.githubusercontent.com/Runtimepirate/About_me/main/Profile_pic.jpg",
+         width=200,
+     )
+     st.markdown(
+         "## **Mr. Aditya Katariya [[Resume](https://drive.google.com/file/d/1Vq9-H1dl5Kky2ugXPIbnPvJ72EEkTROY/view?usp=drive_link)]**"
+     )
+     st.markdown("*College - Noida Institute of Engineering and Technology, U.P.*")
+     st.markdown("----")
+     st.markdown("## Contact Details")
+     st.markdown("📫 *[Prasaritation@gmail.com](mailto:Prasaritation@gmail.com)*")
+     st.markdown("💼 *[LinkedIn](https://www.linkedin.com/in/adityakatariya/)*")
+     st.markdown("💻 *[GitHub](https://github.com/Runtimepirate)*")
+     st.markdown("----")
+     st.markdown("**AI & ML Enthusiast**")
+     st.markdown(
+         "Passionate about solving real-world problems using data science and "
+         "customer analytics. Always learning and building smart, scalable AI solutions."
+     )
+     st.markdown("----")
+     mode = st.radio("Choose Mode:", ["Ask Anything", "Challenge Me"], key="mode")
+
+ st.title("📚 Document-Aware Assistant")
+
+ st.markdown(
+     """
+ This assistant **reads your uploaded PDF or TXT document**, produces a *≤150-word*
+ summary, answers your questions with paragraph-level justification, **generates
+ logic-based questions**, and evaluates your responses.
+ """
+ )
+
+ @st.cache_resource(show_spinner=True)
+ def load_models():
+     """Load all required Hugging Face pipelines once and reuse them."""
+     # device_map="auto" places each model on a GPU when one is available
+     # (this needs the `accelerate` package installed).
+     summarizer = pipeline(
+         "summarization",
+         model="facebook/bart-large-cnn",
+         device_map="auto",
+     )
+     qa = pipeline(
+         "question-answering",
+         model="deepset/roberta-base-squad2",
+         device_map="auto",
+     )
+     qg = pipeline(
+         "text2text-generation",
+         model="valhalla/t5-small-qg-hl",
+         device_map="auto",
+         max_length=64,
+     )
+     return summarizer, qa, qg
+
+ summarizer, qa_pipeline, qg_pipeline = load_models()
+
+ def extract_text_from_pdf(uploaded_file: io.BytesIO) -> str:
+     reader = PyPDF2.PdfReader(uploaded_file)
+     text = ""
+     for page in reader.pages:
+         page_text = page.extract_text()
+         if page_text:  # extract_text() can return None for image-only pages
+             text += page_text + "\n"
+     return text
+
+ def extract_text(uploaded_file) -> str:
+     if uploaded_file.name.lower().endswith(".pdf"):
+         return extract_text_from_pdf(uploaded_file)
+     elif uploaded_file.name.lower().endswith(".txt"):
+         return uploaded_file.read().decode("utf-8", errors="ignore")
+     return ""
+
+ def chunk_text(text: str, max_tokens: int = 450) -> List[str]:
+     """Split text into roughly max_tokens-sized chunks along sentence boundaries."""
+     sentences = nltk.sent_tokenize(text)
+     chunks: List[str] = []
+     current: List[str] = []
+     token_count = 0
+
+     for sent in sentences:
+         num_tokens = len(sent.split())  # whitespace tokens as a cheap length proxy
+         if token_count + num_tokens > max_tokens and current:
+             chunks.append(" ".join(current))
+             current = []
+             token_count = 0
+         current.append(sent)
+         token_count += num_tokens
+     if current:
+         chunks.append(" ".join(current))
+     return chunks
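+
+ # Worked example (hypothetical input): with max_tokens=6, the text
+ # "The cat sat. The dog ran far. It rained." chunks to
+ # ["The cat sat.", "The dog ran far. It rained."]; the second sentence
+ # would overflow the first chunk's budget, so it opens a new chunk.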
+
+ def get_best_answer(question: str, chunks: List[str]) -> Tuple[str, int, int, float, str]:
+     """Run QA over chunks, return best answer with its score and context chunk."""
+     best = {"score": -float("inf")}
+     for chunk in chunks:
+         try:
+             answer = qa_pipeline(question=question, context=chunk)
+             if answer["score"] > best["score"] and answer["answer"].strip():
+                 best = {
+                     "answer": answer["answer"],
+                     "score": answer["score"],
+                     "start": answer["start"],
+                     "end": answer["end"],
+                     "context": chunk,
+                 }
+         except Exception:
+             continue
+     if best["score"] == -float("inf"):
+         return "", 0, 0, 0.0, ""
+     return (
+         best["answer"],
+         best["start"],
+         best["end"],
+         best["score"],
+         best["context"],
+     )
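+
+ # Illustration (hypothetical values): each qa_pipeline(...) call returns a dict
+ # such as {"answer": "two years", "score": 0.83, "start": 34, "end": 43}, where
+ # start/end are character offsets into the context string; highlight_answer()
+ # below depends on those offsets to bold the answer span.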
+
+ def highlight_answer(context: str, start: int, end: int) -> str:
+     """Return context with the answer wrapped in **bold** for display."""
+     return context[:start] + " **" + context[start:end] + "** " + context[end:]
+
+ def generate_logic_questions(text: str, num_q: int = 3) -> List[str]:
+     """Generate num_q questions from the document using the QG pipeline."""
+     sentences = nltk.sent_tokenize(text)
+     questions: List[str] = []
+     for sent in sentences:
+         if len(questions) >= num_q:
+             break
+         # The valhalla QG models expect the highlighted span wrapped in <hl> tokens.
+         hl_text = f"<hl> {sent} <hl> "
+         try:
+             q = qg_pipeline(hl_text, do_sample=False, max_length=64)[0]["generated_text"]
+             q = q.strip().rstrip("?.!") + "?"
+             if q not in questions:
+                 questions.append(q)
+         except Exception:
+             continue
+     # Pad with generic fallbacks if generation produced fewer than num_q questions.
+     default_q = [
+         "What is the main topic of the document?",
+         "Summarize the methodology described.",
+         "What are the key findings or conclusions?",
+     ]
+     while len(questions) < num_q:
+         questions.append(default_q[len(questions)])
+     return questions
+
+ uploaded = st.file_uploader("Upload PDF or TXT Document", type=["pdf", "txt"], key="uploader")
+
+ if uploaded:
+     doc_text = extract_text(uploaded)
+
+     # Re-chunk (and regenerate challenge questions) whenever a new document is
+     # uploaded, rather than reusing stale session state from a previous file.
+     if st.session_state.get("doc_text") != doc_text:
+         st.session_state["doc_text"] = doc_text
+         st.session_state["chunks"] = chunk_text(doc_text)
+         st.session_state.pop("logic_questions", None)
+
+     st.subheader("🔎 Auto Summary (≤ 150 words)")
+     try:
+         # BART's input window is limited, so only the leading text is summarized.
+         summary = summarizer(
+             doc_text[:4096],
+             max_length=150,
+             min_length=30,
+             do_sample=False,
+         )[0]["summary_text"]
+         st.write(summary)
+     except Exception as e:
+         st.error(f"Summarization failed: {e}")
+
+     if mode == "Ask Anything":
+         st.subheader("💬 Ask Anything")
+         question = st.text_input("Ask a question about the document:", key="user_question")
+         if st.button("Submit Question", key="submit_question") and question:
+             with st.spinner("Finding answer..."):
+                 ans, start, end, score, context = get_best_answer(
+                     question, st.session_state["chunks"]
+                 )
+             if ans:
+                 st.markdown(f"**Answer:** {ans}")
+                 justification = highlight_answer(context, start, end)
+                 st.caption(f"Justification: …{justification[:300]}…")
+                 st.caption(
+                     f"Confidence Score: {score:.3f} | Paragraph tokens: {len(context.split())}"
+                 )
+             else:
+                 st.warning("Sorry, I couldn't find an answer in the document.")
+
+     elif mode == "Challenge Me":
+         st.subheader("🎯 Challenge Me")
+         if "logic_questions" not in st.session_state:
+             st.session_state["logic_questions"] = generate_logic_questions(doc_text)
+
+         for idx, q in enumerate(st.session_state["logic_questions"]):
+             st.text_input(f"Q{idx+1}: {q}", key=f"logic_q_{idx}")
+
+         if st.button("Submit Answers", key="submit_logic"):
+             st.markdown("----")
+             for idx, q in enumerate(st.session_state["logic_questions"]):
+                 user_ans = st.session_state.get(f"logic_q_{idx}", "").strip()
+                 correct, start, end, score, context = get_best_answer(
+                     q, st.session_state["chunks"]
+                 )
+                 st.markdown(f"**Q{idx+1} Evaluation:**")
+                 st.write(f"*Your Answer*: {user_ans or '—'}")
+                 st.write(f"*Expected Answer*: {correct or 'Not found in document'}")
+                 if correct:
+                     justification = highlight_answer(context, start, end)
+                     st.caption(f"Justification: …{justification[:300]}…")
+                     st.caption(f"Confidence Score: {score:.3f}")
+                 st.markdown("----")
+
+ else:
+     st.info("Please upload a PDF or TXT document to begin.")
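
To try the app locally, one would typically install the packages the script imports and launch it with Streamlit (a minimal sketch; `accelerate` is assumed here to back `device_map="auto"`):

pip install streamlit transformers torch accelerate PyPDF2 nltk
streamlit run app.py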