Spaces:

mrsk1883
/

testing

Sleeping

App Files Files Community

testing / app.py

mrsk1883

Update app.py

c870fc7 about 2 years ago

raw

history blame contribute delete

2.89 kB

	import gradio as gr
	from PyPDF2 import PdfReader
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
	from gtts import gTTS
	from io import BytesIO
	import re
	import os

	# Hugging Face Hub identifier of the seq2seq checkpoint used for summarization.
	model_name = "ArtifactAI/led_large_16384_arxiv_summarization"
	# Load the model and its matching tokenizer once at import time; the
	# summarization function below reuses these module-level objects on every call.
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
	tokenizer = AutoTokenizer.from_pretrained(model_name)

	def extract_first_sentence(text):
	    """Return the first sentence of *text*.

	    A sentence boundary is whitespace that directly follows ``.``, ``?`` or
	    ``!``, unless the terminator belongs to a dotted abbreviation such as
	    "e.g." or to a short title such as "Dr.".

	    Args:
	        text: The string to take the first sentence from.

	    Returns:
	        The text up to and including the first sentence terminator, or
	        *text* unchanged when it is empty or contains no boundary.
	    """
	    if not text:
	        return text
	    # The previous pattern escaped the alternation (`\.\|\?`), so it only
	    # matched the literal characters ".|?" and never split real text.
	    # A character class expresses "any one sentence terminator".
	    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s'
	    # Only the first sentence is needed, so stop after the first split.
	    return re.split(boundary, text, maxsplit=1)[0]

	def _find_abstract(page_text):
	    """Return the text following an "Abstract" heading in *page_text*, or None if absent."""
	    heading = re.search(r'\bAbstract\b', page_text, re.IGNORECASE)
	    if heading is None:
	        return None
	    body = page_text[heading.end():]
	    # The alternation bars must be unescaped; with `\|` the pattern only
	    # matched the literal string "Introduction|Methodology|Conclusion" and
	    # the abstract was never cut at the next section heading.
	    next_section = re.search(r'\b(?:Introduction|Methodology|Conclusion)\b', body)
	    if next_section:
	        body = body[:next_section.start()]
	    return body


	def extract_abstract_and_summarize(pdf_file):
	    """Extract a PDF's abstract, summarize it in one sentence, and synthesize speech.

	    Args:
	        pdf_file: Path to the PDF, or a file-like wrapper exposing the path
	            through a ``name`` attribute.

	    Returns:
	        A tuple ``(summary_sentence, audio_bytes, abstract_text)``: the first
	        sentence of the generated summary, the gTTS audio of that sentence as
	        raw bytes, and the stripped abstract the summary was produced from.

	    Raises:
	        gr.Error: If the PDF cannot be read, no abstract is found, or
	            summarization / speech synthesis fails. The original exception
	            is chained as the cause.
	    """
	    # Accept both a plain path and an object that carries its path in `.name`.
	    pdf_path = getattr(pdf_file, "name", pdf_file)
	    try:
	        abstract_text = ''
	        with open(pdf_path, 'rb') as file:
	            for page in PdfReader(file).pages:
	                # extract_text() may yield nothing for image-only pages.
	                found = _find_abstract(page.extract_text() or '')
	                if found is not None:
	                    abstract_text = found
	                    break  # Stop at the first page that has an abstract

	        abstract_text = abstract_text.strip()
	        if not abstract_text:
	            # Fail with a clear message instead of summarizing empty input.
	            raise ValueError("No abstract found in the uploaded PDF.")

	        # Summarize the extracted abstract
	        inputs = tokenizer(abstract_text, return_tensors="pt")
	        outputs = model.generate(**inputs)
	        # Drop special tokens so they are neither displayed nor spoken.
	        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

	        # Keep only the first sentence
	        summary_sentence = extract_first_sentence(summary)

	        # Generate audio into an in-memory buffer
	        speech = gTTS(text=summary_sentence, lang="en")
	        speech_bytes = BytesIO()
	        speech.write_to_fp(speech_bytes)

	        return summary_sentence, speech_bytes.getvalue(), abstract_text

	    except Exception as e:
	        # gr.Error shows the message in the UI; chaining keeps the traceback.
	        raise gr.Error(str(e)) from e

	# Gradio UI: one PDF upload in; summary text, spoken summary and the
	# extracted abstract out. The number of output components must match the
	# three values returned by extract_abstract_and_summarize.
	interface = gr.Interface(
	    fn=extract_abstract_and_summarize,
	    inputs=[gr.File(label="Upload PDF")],
	    outputs=[
	        gr.Textbox(label="Summary"),
	        gr.Audio(),
	        gr.Textbox(label="Extracted Abstract"),
	    ],
	    title="PDF Summarization & Audio Tool",
	    description="""PDF Summarization App. This app extracts the abstract from a PDF, summarizes it in one sentence, and generates an audio of it. Only upload PDFs with abstracts.
	Please read the README.MD for information about the app and sample PDFs.""",
	    # The sample PDF is resolved relative to this file.
	    examples=[os.path.join(os.path.dirname(__file__), "Article 11 Hidden Technical Debt in Machine Learning Systems.pdf")],
	    cache_examples=True,
	)

	interface.launch(share=True)