| | import gradio as gr |
| | from PyPDF2 import PdfReader |
| | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer |
| | from gtts import gTTS |
| | from io import BytesIO |
| | import re |
| | import os |
| |
|
# Hugging Face checkpoint identifier; the tokenizer and the seq2seq model
# below are both loaded from it once, at import time.
model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
| |
|
def extract_first_sentence(text):
    """Return the first sentence of *text*.

    A sentence boundary is a whitespace character that follows ``.`` or
    ``?``, unless the preceding characters look like a dotted abbreviation
    (e.g. ``U.S.``, ``i.e.``) or a capitalised two-letter title such as
    ``Dr.``. If no boundary is found the whole text is returned unchanged
    (this includes the empty string).

    Args:
        text: The string to take the first sentence from.

    Returns:
        The substring of ``text`` up to and including the first sentence
        terminator, or ``text`` itself when there is no boundary.
    """
    # maxsplit=1: only the first piece is needed, so stop at the first
    # boundary instead of splitting the entire text. re.split always returns
    # at least one element, so indexing [0] is safe without a fallback.
    return re.split(
        r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text, maxsplit=1
    )[0]
| |
|
def extract_abstract_and_summarize(pdf_file):
    """Extract a PDF's abstract, summarize it in one sentence and voice it.

    The pages are scanned in order for the first one containing the word
    "Abstract" (case-insensitive). The abstract is the text after that word
    up to the next "Introduction", "Methodology" or "Conclusion" heading on
    the same page, or to the end of the page when none is present. The
    abstract is summarized with the module-level ``model``/``tokenizer``,
    reduced to its first sentence, and converted to speech with gTTS.

    Args:
        pdf_file: Path to the PDF, or an uploaded-file object exposing the
            path as ``.name``.

    Returns:
        A tuple ``(summary_sentence, audio_bytes, abstract_text)`` where
        ``audio_bytes`` is the gTTS output as ``bytes`` and ``abstract_text``
        is the extracted abstract with surrounding whitespace stripped.

    Raises:
        Exception: On any failure (missing file, unreadable PDF, no abstract
            found, model or text-to-speech errors). The message is that of
            the underlying error, which is chained as ``__cause__``.
    """
    try:
        if pdf_file is None:
            raise ValueError("No PDF file was provided.")

        # Depending on the Gradio version the upload arrives either as a
        # plain path or as a temp-file wrapper whose path is in `.name`.
        # Path-like values are used as-is (Path.name would drop the folder).
        if isinstance(pdf_file, (str, bytes, os.PathLike)):
            pdf_path = pdf_file
        else:
            pdf_path = getattr(pdf_file, "name", pdf_file)

        abstract_text = ''
        with open(pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)

            for page in pdf_reader.pages:
                # Pages without a text layer may yield nothing; normalise to
                # an empty string so the regex search cannot fail.
                text = page.extract_text() or ''

                abstract_match = re.search(r'\bAbstract\b', text, re.IGNORECASE)
                if not abstract_match:
                    continue

                start_index = abstract_match.end()

                # The abstract ends where the next known section heading
                # starts, if there is one on this page.
                next_section_match = re.search(r'\b(?:Introduction|Methodology|Conclusion)\b', text[start_index:])

                if next_section_match:
                    end_index = start_index + next_section_match.start()
                    abstract_text = text[start_index:end_index]
                else:
                    abstract_text = text[start_index:]

                # Only the first page mentioning "Abstract" is used.
                break

        if not abstract_text.strip():
            # Fail loudly instead of summarizing an empty string.
            raise ValueError("No abstract could be found in the uploaded PDF.")

        # Truncate over-long input to the tokenizer's configured maximum.
        inputs = tokenizer(abstract_text, return_tensors="pt", truncation=True)
        outputs = model.generate(**inputs)
        # Drop special tokens so they do not end up in the displayed text or
        # get read aloud by the text-to-speech step.
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

        summary_sentence = extract_first_sentence(summary)

        # Render the one-sentence summary to audio entirely in memory.
        speech = gTTS(text=summary_sentence, lang="en")
        speech_bytes = BytesIO()
        speech.write_to_fp(speech_bytes)

        return summary_sentence, speech_bytes.getvalue(), abstract_text.strip()

    except Exception as e:
        # Keep raising a plain Exception with the same message, but chain the
        # original error so its type and traceback are not lost.
        raise Exception(str(e)) from e
| |
|
# Gradio UI: one PDF upload in; summary text, spoken summary and the extracted
# abstract out. The outputs list mirrors, in order, the three values returned
# by extract_abstract_and_summarize, so the abstract is actually displayed.
interface = gr.Interface(
    fn=extract_abstract_and_summarize,
    inputs=[gr.File(label="Upload PDF")],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Audio(),
        gr.Textbox(label="Abstract"),
    ],
    title="PDF Summarization & Audio Tool",
    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
    Please read the README.MD for information about the app and sample PDFs.""",
    # The sample PDF is looked up in the same directory as this script.
    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
    cache_examples=True,
)

# Start the web server; share=True additionally requests a public share link.
interface.launch(share=True)