"""Utilities for processing uploaded documents.""" import io try: from pypdf import PdfReader except ImportError: from PyPDF2 import PdfReader def read_uploaded_file(uploaded_file): """ Read and process uploaded file (TXT or PDF). Args: uploaded_file: Streamlit UploadedFile object Returns: list: List of text chunks from the document """ uploaded_file.seek(0) if uploaded_file.type == "application/pdf": return process_pdf(uploaded_file) else: return process_text(uploaded_file) def process_pdf(uploaded_file): """Extract text from PDF file.""" pdf_reader = PdfReader(io.BytesIO(uploaded_file.read())) text = "" for page in pdf_reader.pages: text += page.extract_text() + "\n" return split_into_chunks(text) def process_text(uploaded_file): """Read text file.""" text = uploaded_file.read().decode("utf-8") return split_into_chunks(text) def split_into_chunks(text): """Split text into chunks by lines.""" docs = text.split("\n") docs = [doc.strip() for doc in docs if doc.strip()] return docs