Spaces:

npaleti2002
/

StudySense

Sleeping

App Files Files Community

npaleti2002 commited on Dec 1, 2025

Commit

7761e22

verified ·

1 Parent(s): f4e08b6

Create app.py

Browse files

Files changed (1) hide show

app.py +248 -0

app.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import re
+from pathlib import Path
+import gradio as gr
+import numpy as np
+import pdfplumber
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import pipeline
+# ---------- Models ----------
+summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
+# ---------- Global state (will be stored in gr.State) ----------
+# lecture_chunks, vectorizer, X_matrix will live in state
+# ---------- Helpers ----------
+def load_text_from_file(file_obj) -> str:
+    if file_obj is None:
+        return ""
+    path = Path(file_obj.name)
+    suffix = path.suffix.lower()
+    if suffix == ".pdf":
+        texts = []
+        with pdfplumber.open(file_obj) as pdf:
+            for page in pdf.pages:
+                page_text = page.extract_text() or ""
+                texts.append(page_text)
+        raw_text = "\n".join(texts)
+    elif suffix == ".txt":
+        raw_text = file_obj.read().decode("utf-8", errors="ignore")
+    else:
+        raise ValueError("Only .pdf and .txt files are supported.")
+    return clean_text(raw_text)
+def clean_text(text: str) -> str:
+    text = text.replace("\r", " ")
+    text = re.sub(r"\n+", "\n", text)
+    text = re.sub(r"[ \t]+", " ", text)
+    return text.strip()
+def chunk_text(text: str, chunk_words: int = 350, overlap_words: int = 50):
+    words = text.split()
+    chunks = []
+    start = 0
+    chunk_id = 1
+    while start < len(words):
+        end = start + chunk_words
+        chunk_words_list = words[start:end]
+        chunk_text_ = " ".join(chunk_words_list)
+        chunks.append(
+            {
+                "chunk_id": f"C{chunk_id}",
+                "text": chunk_text_,
+            }
+        )
+        chunk_id += 1
+        start = end - overlap_words
+    return chunks
+def build_retriever(chunks):
+    docs = [c["text"] for c in chunks]
+    vectorizer = TfidfVectorizer(
+        max_features=10000,
+        ngram_range=(1, 2),
+        min_df=1,
+    )
+    X = vectorizer.fit_transform(docs)
+    return vectorizer, X
+def generate_summary(text: str, max_words: int = 300) -> str:
+    if not text:
+        return "No text found in the uploaded file."
+    # Hugging Face summarization has a max token limit; we slice text roughly
+    # into smaller windows and summarize each, then summarize again.
+    # Keep it simple & fast.
+    max_chunk_chars = 2500
+    windows = []
+    start = 0
+    while start < len(text):
+        end = start + max_chunk_chars
+        windows.append(text[start:end])
+        start = end
+    partial_summaries = []
+    for w in windows[:3]:  # hard cap, don’t explode runtime
+        s = summarizer(
+            w,
+            max_length=180,
+            min_length=60,
+            do_sample=False,
+            truncation=True,
+        )[0]["summary_text"]
+        partial_summaries.append(s)
+    combined = " ".join(partial_summaries)
+    final = summarizer(
+        combined,
+        max_length=220,
+        min_length=80,
+        do_sample=False,
+        truncation=True,
+    )[0]["summary_text"]
+    return final
+def retrieve_chunks(question, chunks, vectorizer, X, top_k: int = 5):
+    if not chunks or vectorizer is None or X is None:
+        return []
+    q_vec = vectorizer.transform([question])
+    sims = cosine_similarity(q_vec, X)[0]
+    top_idx = np.argsort(-sims)[:top_k]
+    results = []
+    for rank, idx in enumerate(top_idx, start=1):
+        c = chunks[idx]
+        results.append(
+            {
+                "rank": rank,
+                "chunk_id": c["chunk_id"],
+                "text": c["text"],
+                "similarity": float(sims[idx]),
+            }
+        )
+    return results
+def answer_question(question, chunks, vectorizer, X):
+    if not question.strip():
+        return "Please enter a question.", ""
+    retrieved = retrieve_chunks(question, chunks, vectorizer, X, top_k=3)
+    if not retrieved:
+        return "Please upload and process a lecture first.", ""
+    context_text = "\n\n".join([r["text"] for r in retrieved])
+    try:
+        ans = qa_pipeline(
+            {
+                "question": question,
+                "context": context_text,
+            }
+        )
+        answer = ans.get("answer", "").strip()
+    except Exception as e:
+        answer = f"Error from QA model: {e}"
+    # Build a short “sources” string
+    source_info = "; ".join(
+        [f"{r['chunk_id']} (sim={r['similarity']:.3f})" for r in retrieved]
+    )
+    return answer, source_info
+# ---------- Gradio Callbacks ----------
+def process_lecture(file):
+    """
+    1. Read PDF/TXT
+    2. Chunk
+    3. Build retriever
+    4. Generate summary
+    Returns: summary, chunks, vectorizer, X
+    """
+    if file is None:
+        return "Please upload a lecture file.", [], None, None
+    try:
+        text = load_text_from_file(file)
+    except Exception as e:
+        return f"Error reading file: {e}", [], None, None
+    if len(text) < 100:
+        return "File text is too short or empty after extraction.", [], None, None
+    chunks = chunk_text(text, chunk_words=350, overlap_words=50)
+    vectorizer, X = build_retriever(chunks)
+    summary = generate_summary(text)
+    return summary, chunks, vectorizer, X
+def chat_fn(question, chunks, vectorizer, X):
+    answer, sources = answer_question(question, chunks, vectorizer, X)
+    if sources:
+        answer = f"{answer}\n\n_Sources: {sources}_"
+    return answer
+# ---------- Gradio UI ----------
+with gr.Blocks() as demo:
+    gr.Markdown("# 📚 Lecture Summarizer + Chatbot\nUpload a PDF/TXT lecture, get a summary, then ask questions about it.")
+    with gr.Row():
+        file_input = gr.File(label="Upload lecture (.pdf or .txt)")
+        process_btn = gr.Button("Process Lecture")
+    summary_box = gr.Textbox(
+        label="Lecture Summary",
+        lines=12,
+        interactive=False,
+    )
+    # State: saved across chat turns
+    chunks_state = gr.State([])
+    vectorizer_state = gr.State(None)
+    X_state = gr.State(None)
+    process_btn.click(
+        fn=process_lecture,
+        inputs=[file_input],
+        outputs=[summary_box, chunks_state, vectorizer_state, X_state],
+    )
+    gr.Markdown("## 💬 Chat with the Lecture")
+    with gr.Row():
+        question_box = gr.Textbox(label="Your Question")
+    answer_box = gr.Textbox(label="Answer", lines=6, interactive=False)
+    ask_btn = gr.Button("Ask")
+    ask_btn.click(
+        fn=chat_fn,
+        inputs=[question_box, chunks_state, vectorizer_state, X_state],
+        outputs=[answer_box],
+    )
+if __name__ == "__main__":
+    demo.launch()