| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain.docstore.document import Document | |
| import os | |
| from rag import Rag | |
# Default folder that is scanned for PDF files when no path is given.
pdf_folder_path = 'files'


def get_documents_from_path(pdf_folder_path: str = pdf_folder_path) -> list:
    """Load every PDF in a folder and return its contents as ``Document`` objects.

    Each PDF file directly inside ``pdf_folder_path`` is read with
    ``PyPDFLoader``. Every document returned by the loader is re-wrapped in a
    new ``Document`` that keeps the original ``page_content`` but replaces the
    metadata with ``{"source": <file name without extension>}``.

    Args:
        pdf_folder_path: Directory to scan (not recursive). Defaults to the
            module-level ``pdf_folder_path`` value.

    Returns:
        A list of ``Document`` objects, ordered by file name and then by the
        order in which the loader produced them. The list is empty when the
        folder contains no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder does not exist
            or cannot be read.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # resulting document order reproducible between runs and machines.
    for pdf_file in sorted(os.listdir(pdf_folder_path)):
        # Match the extension case-insensitively so "REPORT.PDF" is not missed.
        if not pdf_file.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        # A directory that happens to be named "*.pdf" is not a loadable file.
        if not os.path.isfile(pdf_path):
            continue
        file_name_without_extension = os.path.splitext(pdf_file)[0]
        loader = PyPDFLoader(pdf_path)
        documents.extend(
            Document(
                page_content=doc.page_content,
                metadata={"source": file_name_without_extension},
            )
            for doc in loader.load()
        )
    return documents
if __name__ == "__main__":
    # Script entry point: load all PDFs from the default folder and hand them
    # to the Rag instance for storage in its vector store.
    import sys

    try:
        rag_llm = Rag()
        documents = get_documents_from_path()
        rag_llm.storeDocumentsInVectorstore(documents)
    except Exception as e:
        # Top-level boundary: report the failure on stderr and exit non-zero
        # so shells, CI jobs and schedulers can tell the ingestion failed.
        print(e, file=sys.stderr)
        sys.exit(1)
    else:
        print("Store PDFS Completed")