npaleti2002 commited on
Commit
7761e22
·
verified ·
1 Parent(s): f4e08b6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -0
app.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from pathlib import Path
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import pdfplumber
7
+
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+
11
+ from transformers import pipeline
12
+
13
# ---------- Models ----------
# Both pipelines are loaded once at import time (weights are downloaded and
# cached by transformers on first run, so startup may be slow).
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
16
+
17
+ # ---------- Global state (will be stored in gr.State) ----------
18
+ # lecture_chunks, vectorizer, X_matrix will live in state
19
+
20
+
21
+ # ---------- Helpers ----------
22
def load_text_from_file(file_obj) -> str:
    """Extract and normalize text from an uploaded .pdf or .txt file.

    Args:
        file_obj: The value produced by ``gr.File`` — either a tempfile-like
            object exposing ``.name`` or a plain filesystem path string.

    Returns:
        The extracted text, whitespace-normalized via ``clean_text``;
        an empty string when no file was provided.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.txt``.
    """
    if file_obj is None:
        return ""
    # gradio may hand us a tempfile wrapper (has .name) or a bare path string;
    # handle both instead of assuming .name exists.
    path = Path(getattr(file_obj, "name", file_obj))
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        texts = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                # extract_text() can return None for image-only pages.
                texts.append(page.extract_text() or "")
        raw_text = "\n".join(texts)
    elif suffix == ".txt":
        # Read from disk rather than file_obj.read(): gradio file values are
        # often plain paths or already-closed handles, so .read() is unreliable.
        raw_text = path.read_bytes().decode("utf-8", errors="ignore")
    else:
        raise ValueError("Only .pdf and .txt files are supported.")
    return clean_text(raw_text)
40
+
41
+
42
def clean_text(text: str) -> str:
    """Normalize whitespace in extracted text.

    Carriage returns become spaces, runs of newlines collapse to a single
    newline, runs of spaces/tabs collapse to a single space, and the result
    is stripped of leading/trailing whitespace.
    """
    without_cr = text.replace("\r", " ")
    collapsed_newlines = re.sub(r"\n+", "\n", without_cr)
    collapsed_spaces = re.sub(r"[ \t]+", " ", collapsed_newlines)
    return collapsed_spaces.strip()
47
+
48
+
49
def chunk_text(text: str, chunk_words: int = 350, overlap_words: int = 50):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Input text; split on whitespace into words.
        chunk_words: Maximum number of words per chunk.
        overlap_words: Number of words each chunk shares with its predecessor.

    Returns:
        A list of dicts ``{"chunk_id": "C<n>", "text": <chunk string>}``,
        empty when *text* contains no words.

    Raises:
        ValueError: If ``overlap_words >= chunk_words`` — the original code
            looped forever in that case because the window start never advanced.
    """
    if overlap_words >= chunk_words:
        raise ValueError("overlap_words must be smaller than chunk_words")

    words = text.split()
    chunks = []
    start = 0
    chunk_id = 1

    while start < len(words):
        end = start + chunk_words
        chunks.append(
            {
                "chunk_id": f"C{chunk_id}",
                "text": " ".join(words[start:end]),
            }
        )
        chunk_id += 1
        # Step forward leaving `overlap_words` of context in the next chunk.
        start = end - overlap_words

    return chunks
71
+
72
+
73
def build_retriever(chunks):
    """Fit a TF-IDF index over the chunk texts.

    Args:
        chunks: List of dicts with a ``"text"`` key (from ``chunk_text``).

    Returns:
        ``(vectorizer, doc_matrix)`` — the fitted ``TfidfVectorizer`` and the
        sparse document-term matrix for the chunk corpus.
    """
    corpus = [chunk["text"] for chunk in chunks]
    tfidf = TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        min_df=1,
    )
    doc_matrix = tfidf.fit_transform(corpus)
    return tfidf, doc_matrix
82
+
83
+
84
def generate_summary(text: str, max_words: int = 300) -> str:
    """Produce a two-pass summary of *text* with the BART summarizer.

    The model has a hard input-length limit, so the text is sliced into
    fixed-size character windows, each window is summarized, and the
    concatenated partial summaries are compressed into one final summary.
    Only the first three windows are used to bound runtime.

    Args:
        text: Cleaned lecture text.
        max_words: Unused target size kept for interface compatibility.

    Returns:
        The final summary string, or a notice when *text* is empty.
    """
    if not text:
        return "No text found in the uploaded file."

    window_size = 2500
    windows = [text[pos:pos + window_size] for pos in range(0, len(text), window_size)]

    partials = []
    for window in windows[:3]:  # hard cap, don’t explode runtime
        result = summarizer(
            window,
            max_length=180,
            min_length=60,
            do_sample=False,
            truncation=True,
        )
        partials.append(result[0]["summary_text"])

    merged = " ".join(partials)
    final_result = summarizer(
        merged,
        max_length=220,
        min_length=80,
        do_sample=False,
        truncation=True,
    )
    return final_result[0]["summary_text"]
120
+
121
+
122
def retrieve_chunks(question, chunks, vectorizer, X, top_k: int = 5):
    """Rank lecture chunks by TF-IDF cosine similarity to *question*.

    Args:
        question: The user's query string.
        chunks: Chunk dicts produced by ``chunk_text``.
        vectorizer: Fitted ``TfidfVectorizer`` from ``build_retriever``.
        X: Document-term matrix aligned with *chunks*.
        top_k: Maximum number of results to return.

    Returns:
        Up to *top_k* dicts with ``rank``, ``chunk_id``, ``text`` and
        ``similarity`` keys, best match first; empty list when the
        retriever state is missing.
    """
    if not chunks or vectorizer is None or X is None:
        return []

    question_vec = vectorizer.transform([question])
    scores = cosine_similarity(question_vec, X)[0]

    # Negate so argsort yields descending similarity order.
    best_indices = np.argsort(-scores)[:top_k]
    return [
        {
            "rank": rank,
            "chunk_id": chunks[idx]["chunk_id"],
            "text": chunks[idx]["text"],
            "similarity": float(scores[idx]),
        }
        for rank, idx in enumerate(best_indices, start=1)
    ]
142
+
143
+
144
def answer_question(question, chunks, vectorizer, X):
    """Answer *question* using the top retrieved lecture chunks.

    Args:
        question: User query; blank input short-circuits with a prompt.
        chunks, vectorizer, X: Retriever state from ``process_lecture``.

    Returns:
        ``(answer, sources)`` — the extracted answer (or an error/prompt
        message) and a semicolon-joined summary of the chunks used.
    """
    if not question.strip():
        return "Please enter a question.", ""

    hits = retrieve_chunks(question, chunks, vectorizer, X, top_k=3)
    if not hits:
        return "Please upload and process a lecture first.", ""

    context_text = "\n\n".join(hit["text"] for hit in hits)

    try:
        result = qa_pipeline(
            {
                "question": question,
                "context": context_text,
            }
        )
        answer = result.get("answer", "").strip()
    except Exception as e:
        # Surface model failures in the answer slot instead of crashing the UI.
        answer = f"Error from QA model: {e}"

    # Build a short “sources” string
    source_info = "; ".join(
        f"{hit['chunk_id']} (sim={hit['similarity']:.3f})" for hit in hits
    )

    return answer, source_info
171
+
172
+
173
+ # ---------- Gradio Callbacks ----------
174
def process_lecture(file):
    """Gradio callback: read, chunk, index and summarize an uploaded lecture.

    Pipeline: read PDF/TXT → chunk → build TF-IDF retriever → summarize.

    Returns:
        ``(summary, chunks, vectorizer, X)``; on any failure the summary
        slot carries the message and the state slots are ``[]``/``None``.
    """
    empty_state = ([], None, None)

    if file is None:
        return ("Please upload a lecture file.",) + empty_state

    try:
        text = load_text_from_file(file)
    except Exception as e:
        return (f"Error reading file: {e}",) + empty_state

    if len(text) < 100:
        return ("File text is too short or empty after extraction.",) + empty_state

    chunks = chunk_text(text, chunk_words=350, overlap_words=50)
    vectorizer, X = build_retriever(chunks)
    return generate_summary(text), chunks, vectorizer, X
198
+
199
+
200
def chat_fn(question, chunks, vectorizer, X):
    """Gradio chat callback: answer plus an italicized sources footer."""
    answer, sources = answer_question(question, chunks, vectorizer, X)
    return f"{answer}\n\n_Sources: {sources}_" if sources else answer
205
+
206
+
207
# ---------- Gradio UI ----------
with gr.Blocks() as demo:
    gr.Markdown("# 📚 Lecture Summarizer + Chatbot\nUpload a PDF/TXT lecture, get a summary, then ask questions about it.")

    # Upload control and processing trigger side by side.
    with gr.Row():
        file_input = gr.File(label="Upload lecture (.pdf or .txt)")
        process_btn = gr.Button("Process Lecture")

    # Read-only display for the generated summary.
    summary_box = gr.Textbox(
        label="Lecture Summary",
        lines=12,
        interactive=False,
    )

    # State: saved across chat turns
    # These hold process_lecture's outputs (chunk list, fitted TF-IDF
    # vectorizer, document matrix) so later chat turns can retrieve context.
    chunks_state = gr.State([])
    vectorizer_state = gr.State(None)
    X_state = gr.State(None)

    # Processing fills the summary box and all three state slots.
    process_btn.click(
        fn=process_lecture,
        inputs=[file_input],
        outputs=[summary_box, chunks_state, vectorizer_state, X_state],
    )

    gr.Markdown("## 💬 Chat with the Lecture")

    with gr.Row():
        question_box = gr.Textbox(label="Your Question")
        answer_box = gr.Textbox(label="Answer", lines=6, interactive=False)

    ask_btn = gr.Button("Ask")

    # Chat reads the question plus the stored retriever state.
    ask_btn.click(
        fn=chat_fn,
        inputs=[question_box, chunks_state, vectorizer_state, X_state],
        outputs=[answer_box],
    )
245
+
246
+
247
if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    demo.launch()