Spaces:

MehulJ
/

document_analyzer

Sleeping

+from typing import Any, Optional
+from smolagents.tools import Tool
+import transformers
+import PyPDF2
+import io
+import requests
class DocumentAnalyzer(Tool):
    """
    A tool that downloads a PDF document and reports a summary and sentiment.

    The document is fetched over HTTP, its text is extracted page by page
    with PyPDF2, and the leading portion of that text is passed to the
    default transformers "summarization" and "sentiment-analysis" pipelines.
    """
    name = "analyze_document"
    description = "Analyzes a PDF document and extracts key information like summary and sentiment"
    inputs = {'document_url': {'type': 'string', 'description': 'URL to a PDF document'}}
    output_type = "object"

    def __init__(self):
        super().__init__()

    def _get_pipeline(self, task: str, **kwargs):
        """
        Return the transformers pipeline for ``task``, building it on first use.

        Pipelines are cached on the instance so repeated ``forward`` calls do
        not reload the underlying model every time.
        Args:
            task (str): Pipeline task name passed to ``transformers.pipeline``
            **kwargs: Extra arguments forwarded to ``transformers.pipeline``
                the first time the pipeline for ``task`` is created
        Returns:
            The cached pipeline object for ``task``
        """
        from transformers import pipeline

        # setdefault on __dict__ works even if __init__ never created the cache.
        cache = self.__dict__.setdefault("_pipelines", {})
        if task not in cache:
            cache[task] = pipeline(task, **kwargs)
        return cache[task]

    def forward(self, document_url: str) -> dict:
        """
        Analyzes a PDF document and extracts key information.
        Args:
            document_url (str): URL to a PDF document
        Returns:
            dict: ``summary`` (summary text of the first 1024 characters),
                ``sentiment`` (label predicted for the first 512 characters)
                and ``confidence`` (score attached to that label)
        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status
            ValueError: If no text could be extracted from the document
        """
        # Imports stay local so the tool remains self-contained.
        import io

        import PyPDF2
        import requests

        download_timeout_seconds = 30

        # Download the document; without a timeout a stalled server would
        # block the caller forever, and without the status check an HTML
        # error page would be handed to the PDF parser.
        response = requests.get(document_url, timeout=download_timeout_seconds)
        response.raise_for_status()

        # Extract text. Pages without a text layer may yield None or "",
        # so normalise to "" before joining.
        reader = PyPDF2.PdfReader(io.BytesIO(response.content))
        text = "".join(page.extract_text() or "" for page in reader.pages)
        if not text.strip():
            raise ValueError(
                "No extractable text found in the PDF document at "
                f"{document_url!r}"
            )

        # Summarize text. The character slice bounds the input size;
        # truncation=True additionally guards against inputs that still
        # tokenize to more tokens than the model accepts.
        summarizer = self._get_pipeline("summarization", max_length=100)
        summary = summarizer(text[:1024], truncation=True)[0]['summary_text']

        # Sentiment analysis
        sentiment_analyzer = self._get_pipeline("sentiment-analysis")
        sentiment = sentiment_analyzer(text[:512], truncation=True)[0]

        return {
            "summary": summary,
            "sentiment": sentiment['label'],
            "confidence": sentiment['score'],
        }