File size: 1,153 Bytes
362de84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
"""Utilities for processing uploaded documents."""

import io

try:
    from pypdf import PdfReader
except ImportError:
    from PyPDF2 import PdfReader


def read_uploaded_file(uploaded_file):
    """
    Dispatch an uploaded TXT or PDF file to the matching processor.

    Args:
        uploaded_file: Streamlit UploadedFile object

    Returns:
        list: List of text chunks from the document
    """
    # Rewind so the whole content is read even if the stream was consumed
    # earlier.
    uploaded_file.seek(0)

    # Only an exact PDF MIME type goes to the PDF path; anything else is
    # handled as plain text.
    is_pdf = uploaded_file.type == "application/pdf"
    handler = process_pdf if is_pdf else process_text
    return handler(uploaded_file)


def process_pdf(uploaded_file):
    """Extract the text of every page of a PDF and split it into chunks.

    Args:
        uploaded_file: Binary file-like object holding the PDF. It is read
            from its current position to the end.

    Returns:
        list: The chunks produced by ``split_into_chunks`` from the
            concatenated page texts (one newline after each page).
    """
    # Hand PdfReader its own in-memory stream built from the uploaded bytes.
    pdf_reader = PdfReader(io.BytesIO(uploaded_file.read()))

    # ``or ""`` is a defensive guard: if an extractor ever yields None for a
    # page (e.g. an image-only page), concatenating it would raise TypeError.
    page_texts = (page.extract_text() or "" for page in pdf_reader.pages)

    # Build the text with a single join instead of repeated ``+=`` in a loop;
    # each page is still terminated by a newline, as before.
    text = "".join(f"{page_text}\n" for page_text in page_texts)
    return split_into_chunks(text)


def process_text(uploaded_file):
    """Read a UTF-8 text file and split it into chunks.

    Args:
        uploaded_file: Binary file-like object whose ``read()`` returns the
            raw bytes of the text file.

    Returns:
        list: The chunks produced by ``split_into_chunks`` from the decoded
            text.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes like "utf-8" but drops a leading byte-order mark.
    # With plain "utf-8" the BOM stays in the text as U+FEFF, which
    # ``str.strip()`` does not remove, so it would end up glued to the first
    # chunk of files saved by editors that write a BOM.
    text = uploaded_file.read().decode("utf-8-sig")
    return split_into_chunks(text)


def split_into_chunks(text):
    """Return the non-blank lines of ``text``, each stripped of whitespace.

    The text is split on ``"\\n"`` only; lines that are empty after stripping
    are dropped.
    """
    # Strip lazily so each line is stripped once, then keep the non-empty ones.
    stripped_lines = (raw_line.strip() for raw_line in text.split("\n"))
    return [line for line in stripped_lines if line]